devops/format_language_csv.py
2025-12-12 11:40:38 +08:00

60 lines
2.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
import csv
from pathlib import Path
# Rewrites the language table at SRC in place as a fully quoted, four-column
# CSV (Id, key, English, ChineseSimplified).  The untouched original bytes are
# copied to "<name>.csv.bak" first, unless that backup already exists.
#
# NOTE(review): the script does not detect output it produced earlier; running
# it a second time would re-parse the generated CSV lines as plain text.
SRC = Path(r"d:\Github\docs\config\LanguageDataTwo.csv")

# A run of two or more spaces / no-break spaces counts as a column gap.
_COLUMN_GAP = re.compile(r"[ \u00A0]{2,}")
# One or more tabs separate columns (empty columns are therefore collapsed).
_TAB_RUN = re.compile(r"\t+")
# First run of CJK Unified Ideographs; marks where the Chinese text begins.
_CJK_RUN = re.compile(r"[\u4e00-\u9fff]+")


def _parse_line(line):
    """Split one source line into an ``(Id, key, English, Chinese)`` tuple.

    Columns are separated by tabs or by runs of two or more spaces /
    no-break spaces.  A single space never separates columns, so a line
    that only uses single spaces is returned through the fallback branch.

    Args:
        line: A non-empty, non-comment line without its trailing newline.

    Returns:
        A 4-tuple of strings.  Lines that yield fewer than three columns
        come back as ``("", "", <stripped line>, "")`` so they stay visible
        in the output for manual review.
    """
    # Normalise wide gaps to tabs so a single split handles both separators.
    normalised = _COLUMN_GAP.sub("\t", line)
    parts = [p.strip() for p in _TAB_RUN.split(normalised)]

    if len(parts) >= 4:
        # Four or more columns: the first two are Id and key, the last one is
        # taken as Chinese, and everything in between is re-joined as English
        # (the English text itself may have contained a wide gap).
        english = " ".join(parts[2:-1]).strip()
        return (parts[0], parts[1], english, parts[-1].strip())

    if len(parts) == 3:
        id_, key, rest = parts
        # English and Chinese share one column; cut conservatively at the
        # first CJK ideograph.  Without any ideograph it is all English.
        match = _CJK_RUN.search(rest)
        if match is None:
            return (id_, key, rest, "")
        cut = match.start()
        return (id_, key, rest[:cut].strip(), rest[cut:].strip())

    # Unparseable line: keep the text with empty Id/key for manual checking.
    return ("", "", line.strip(), "")


if not SRC.exists():
    print(f"Source not found: {SRC}")
    raise SystemExit(1)

# Back up the original file once; an existing backup is never overwritten.
bak = SRC.with_suffix(SRC.suffix + ".bak")
if not bak.exists():
    bak.write_bytes(SRC.read_bytes())

out_rows = []
# "utf-8-sig" drops a leading byte-order mark if present (and is identical to
# "utf-8" otherwise).  With plain "utf-8" the BOM stays glued to the first
# line, which hides a leading "#" comment or pollutes the first Id value.
with SRC.open("r", encoding="utf-8-sig") as f:
    for raw in f:
        line = raw.rstrip("\n")
        stripped = line.strip()
        # Skip blank lines and any line whose first visible character is "#".
        if not stripped or stripped.startswith("#"):
            continue
        out_rows.append(_parse_line(line))

# Write a standard CSV with every field wrapped in double quotes.
# newline="" lets the csv module control line endings itself.
with SRC.open("w", encoding="utf-8", newline="") as out:
    writer = csv.writer(out, quoting=csv.QUOTE_ALL)
    writer.writerow(["Id", "key", "English", "ChineseSimplified"])
    writer.writerows(out_rows)

print(f"Formatted CSV written to {SRC} (backup at {bak})")