Copies/post-correction.py

56 lines
1.1 KiB
Python

import sys
import os
import time
from pathlib import Path
import argparse
# Require the input directory as the first command-line argument.
if len(sys.argv) < 2:
    sys.exit("Usage: python script.py <InputDir>")

arg_path = Path(sys.argv[1])
# NOTE(review): tasks and results are not referenced in the code visible
# here; kept in case another part of the script relies on them.
tasks = []  # List of tuples: (filepath_str, label_str)
results = {}
INPUT_DIR = str(arg_path)

# The path is joined with file names further down, so it must be an existing
# directory. A regular file is rejected here with the same message instead of
# failing later when the JSON file is opened.
if not arg_path.is_dir():
    sys.exit(f"Directory {INPUT_DIR} not found.")
import json
import ftfy
# Input and output JSON files both live inside the input directory.
INPUT_FILE = Path(INPUT_DIR) / "correction.json"
# Spelled OUTPUT_FILE to match the name the script uses when it writes and
# reports the result; the misspelled name left that lookup undefined.
OUTPUT_FILE = Path(INPUT_DIR) / "fixed_correction.json"
OUPUT_FILE = OUTPUT_FILE  # legacy misspelled alias, kept for compatibility
def clean_string(s: str) -> str:
    """Return ``s`` with null bytes removed and encoding issues fixed.

    Null bytes (``\\x00``) are stripped first; the remaining text is then
    passed through ``ftfy.fix_text``.
    """
    return ftfy.fix_text(s.replace("\x00", ""))
def clean_obj(obj):
    """Recursively apply ``clean_string`` to every string value in ``obj``.

    Dicts and lists are rebuilt as new containers with their values/items
    cleaned recursively. Dict keys are kept unchanged. A plain string is
    cleaned directly, and any other object is returned as-is.
    """
    if isinstance(obj, dict):
        return {key: clean_obj(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [clean_obj(item) for item in obj]
    if isinstance(obj, str):
        return clean_string(obj)
    return obj
# Load the correction file, clean every string value in it, and write the
# cleaned structure to the output file as UTF-8 JSON.
data = clean_obj(json.loads(INPUT_FILE.read_text(encoding="utf-8")))

with OUTPUT_FILE.open("w", encoding="utf-8") as out_file:
    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    json.dump(data, out_file, ensure_ascii=False, indent=2)

print("Fixed JSON saved to", OUTPUT_FILE)