"""Post-correct the strings stored in a ``correction.json`` file.

Reads ``<input_dir>/correction.json``, removes NUL characters from every
string value and passes it through ``ftfy.fix_text``, then writes the result
to ``<input_dir>/fixed_correction.json``.

Usage: python post-correction.py <input_dir>
"""

# NOTE: argparse, os and time are not used below; they are kept from the
# original import list.
import argparse
import json
import os
import sys
import time
from pathlib import Path

INPUT_NAME = "correction.json"
OUTPUT_NAME = "fixed_correction.json"


def clean_string(s: str) -> str:
    """Return *s* without NUL characters and run through ``ftfy.fix_text``.

    Args:
        s: The string to clean.

    Returns:
        The cleaned string.
    """
    # Imported here rather than at module level so that usage and
    # missing-directory errors are still reported when ftfy is not installed
    # (the original script also validated its arguments before importing it).
    import ftfy

    # Remove null bytes first, then let ftfy fix encoding issues.
    return ftfy.fix_text(s.replace("\x00", ""))


def clean_obj(obj):
    """Recursively clean every string value inside a decoded JSON document.

    Strings are passed through :func:`clean_string`; lists and dicts are
    rebuilt with their elements/values cleaned. Dict keys are left untouched,
    and any other value is returned unchanged.

    Args:
        obj: A value as produced by ``json.load``.

    Returns:
        A new structure of the same shape with cleaned string values.
    """
    if isinstance(obj, str):
        return clean_string(obj)
    if isinstance(obj, list):
        return [clean_obj(item) for item in obj]
    if isinstance(obj, dict):
        return {key: clean_obj(value) for key, value in obj.items()}
    return obj


def fix_file(input_file: Path, output_file: Path) -> None:
    """Load JSON from *input_file*, clean it, and write it to *output_file*.

    Args:
        input_file: Path of the UTF-8 JSON file to read.
        output_file: Path the cleaned JSON is written to (UTF-8, non-ASCII
            characters kept as-is, 2-space indent).
    """
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    data = clean_obj(data)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def main(argv=None) -> None:
    """Command-line entry point.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``. Only the first argument (the input directory)
            is used.

    Raises:
        SystemExit: If no directory is given, the directory does not exist,
            or it does not contain ``correction.json``.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        sys.exit("Usage: python post-correction.py <input_dir>")

    input_dir = Path(args[0])
    # is_dir() rather than exists(): a regular file must be rejected here too,
    # since the input and output paths are built underneath this directory.
    if not input_dir.is_dir():
        sys.exit(f"Directory {input_dir} not found.")

    input_file = input_dir / INPUT_NAME
    output_file = input_dir / OUTPUT_NAME
    if not input_file.is_file():
        sys.exit(f"Input file {input_file} not found.")

    fix_file(input_file, output_file)
    print("Fixed JSON saved to", output_file)


if __name__ == "__main__":
    main()