# Python script, 60 lines, 2.3 KiB
"""Rewrite a loosely delimited language table as a fully quoted CSV file.

The source file is expected to hold one record per line with the columns
Id, key, English and ChineseSimplified separated by tabs or by runs of two
or more spaces.  The file is rewritten in place; a one-time ``.bak`` copy of
the original bytes is kept next to it.
"""
import re
import csv
from pathlib import Path

SRC = Path(r"d:\Github\docs\config\LanguageDataTwo.csv")

_HEADER = ["Id", "key", "English", "ChineseSimplified"]

# Two or more consecutive spaces / no-break spaces count as a column separator.
_WIDE_GAP = re.compile(r"[ \u00A0]{2,}")
# Consecutive tabs collapse into a single separator.
_TABS = re.compile(r"\t+")
# First run of CJK ideographs; used to split "English Chinese" in one field.
_CJK = re.compile(r"[\u4e00-\u9fff]+")


def _parse_line(raw: str) -> "tuple[str, str, str, str] | None":
    """Turn one raw input line into an ``(Id, key, English, Chinese)`` row.

    Returns ``None`` for blank lines and for comment lines starting with
    ``#``.  Lines that yield fewer than three columns are returned as
    ``("", "", <whole line>, "")`` so they can be reviewed by hand.
    """
    # Strip the whole line first: a trailing tab or trailing double space
    # would otherwise create an empty last column (pushing the Chinese text
    # into the English column), and leading indentation would create an
    # empty first column and shift every field.
    line = raw.strip()
    if not line or line.startswith("#"):
        return None

    # Normalize wide gaps to tabs, then split on tabs only.  A line that
    # contains neither tabs nor wide gaps stays a single column.
    parts = [p.strip() for p in _TABS.split(_WIDE_GAP.sub("\t", line))]

    if len(parts) >= 4:
        # Four or more columns: the first two are Id and key, the last one
        # is the Chinese text, and everything in between is joined back
        # into the English text.
        id_, key = parts[0], parts[1]
        eng = " ".join(parts[2:-1]).strip()
        chi = parts[-1].strip()
        return (id_, key, eng, chi)

    if len(parts) == 3:
        id_, key, rest = parts
        # English and Chinese share one field: cut at the first CJK
        # ideograph (conservative strategy); no ideograph means no Chinese.
        m = _CJK.search(rest)
        if m:
            cut = m.start()
            return (id_, key, rest[:cut].strip(), rest[cut:].strip())
        return (id_, key, rest, "")

    # Unparseable line: keep it with empty Id/key for manual inspection.
    return ("", "", line, "")


def main(src: Path = SRC) -> int:
    """Reformat *src* in place and return a process exit code.

    Args:
        src: File to reformat; defaults to the module-level ``SRC``.

    Returns:
        ``1`` if *src* does not exist, otherwise ``0`` after the file has
        been rewritten.
    """
    if not src.exists():
        print(f"Source not found: {src}")
        return 1

    # Back up the original file once; an existing backup is never overwritten
    # so it keeps the bytes from before the first run.
    bak = src.with_suffix(src.suffix + ".bak")
    if not bak.exists():
        bak.write_bytes(src.read_bytes())

    # "utf-8-sig" drops a leading BOM if present (otherwise it behaves like
    # "utf-8"), so a BOM cannot hide a leading "#" or leak into the first Id.
    with src.open("r", encoding="utf-8-sig") as f:
        out_rows = [row for row in map(_parse_line, f) if row is not None]

    # Write standard CSV with every field wrapped in double quotes.
    with src.open("w", encoding="utf-8", newline="") as out:
        writer = csv.writer(out, quoting=csv.QUOTE_ALL)
        writer.writerow(_HEADER)
        writer.writerows(out_rows)

    print(f"Formatted CSV written to {src} (backup at {bak})")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())