"""Rewrite the whitespace-delimited language table at ``SRC`` as a quoted CSV.

Each usable input line becomes one ``(Id, key, English, ChineseSimplified)``
row. Columns in the input are separated by tabs or by runs of two or more
spaces (regular or non-breaking). The source file is overwritten in place
after a one-time ``.bak`` copy has been made next to it.
"""
import csv
import re
from pathlib import Path
from typing import Optional, Tuple

SRC = Path(r"d:\Github\docs\config\LanguageDataTwo.csv")

HEADER = ["Id", "key", "English", "ChineseSimplified"]

Row = Tuple[str, str, str, str]

# Two or more consecutive spaces / non-breaking spaces count as a column gap.
_GAP_RE = re.compile(r"[ \u00A0]{2,}")
_TAB_RE = re.compile(r"\t+")
# First CJK unified ideograph marks where the Chinese text starts.
_CJK_RE = re.compile(r"[\u4e00-\u9fff]+")
# The header line exactly as csv.QUOTE_ALL writes it; used to detect a file
# that this script has already formatted.
_FORMATTED_HEADER = ",".join(f'"{name}"' for name in HEADER)


def parse_line(raw: str) -> Optional[Row]:
    """Convert one raw input line into an output row.

    Args:
        raw: A line of the source file, with or without its line terminator.

    Returns:
        ``None`` for blank lines and for comment lines (first non-blank
        character is ``#``); otherwise a 4-tuple
        ``(id, key, english, chinese)``. Lines that yield fewer than three
        columns are returned as ``("", "", <stripped line>, "")`` so they
        can be spotted and fixed by hand.
    """
    stripped = raw.strip()
    if not stripped or stripped.startswith("#"):
        return None

    # Normalize multi-space gaps to tabs, then split on tabs only. A single
    # space is never a separator, so English phrases stay intact.
    # Splitting the *stripped* line matters: leading or trailing whitespace
    # would otherwise create empty first/last columns and shift the fields.
    normalized = _GAP_RE.sub("\t", stripped)
    parts = [part.strip() for part in _TAB_RE.split(normalized)]

    if len(parts) >= 4:
        # Id and key are fixed; the last column is taken as Chinese and
        # everything in between is joined back into the English text.
        english = " ".join(parts[2:-1]).strip()
        return (parts[0], parts[1], english, parts[-1])

    if len(parts) == 3:
        id_, key, rest = parts
        # English and Chinese share one column: cut at the first CJK
        # ideograph (conservative heuristic). No ideograph -> all English.
        match = _CJK_RE.search(rest)
        if match is None:
            return (id_, key, rest, "")
        cut = match.start()
        return (id_, key, rest[:cut].strip(), rest[cut:].strip())

    # Unparseable line: keep the text with empty Id/key for manual review.
    return ("", "", stripped, "")


def main(src: Path = SRC) -> int:
    """Format ``src`` in place and return a process exit code.

    Args:
        src: File to format; defaults to the module-level ``SRC``.

    Returns:
        ``1`` if ``src`` does not exist, ``0`` otherwise (including the case
        where the file already carries the formatted header and is left
        untouched).
    """
    if not src.exists():
        print(f"Source not found: {src}")
        return 1

    # Back up the original file once; an existing backup is never overwritten.
    backup = src.with_suffix(src.suffix + ".bak")
    if not backup.exists():
        backup.write_bytes(src.read_bytes())

    # "utf-8-sig" drops a leading BOM if present, so a "#" comment or the
    # first Id on line one is recognized correctly.
    with src.open("r", encoding="utf-8-sig") as handle:
        lines = handle.readlines()

    # Re-running on already formatted output would find no tab separators
    # and turn every row into an "unparseable" row, so bail out instead.
    first_content = next((ln.strip() for ln in lines if ln.strip()), "")
    if first_content == _FORMATTED_HEADER:
        print(f"Already formatted, nothing to do: {src}")
        return 0

    rows = [row for row in map(parse_line, lines) if row is not None]

    # Write a standard CSV with every field wrapped in double quotes.
    with src.open("w", encoding="utf-8", newline="") as out:
        writer = csv.writer(out, quoting=csv.QUOTE_ALL)
        writer.writerow(HEADER)
        writer.writerows(rows)

    print(f"Formatted CSV written to {src} (backup at {backup})")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())