"""Repair accent-corrupted French text inside ``<input_dir>/correction.json``.

The script expects corrupted accented letters to appear as control-character
placeholders (``\\x00``, ``\\x18`` or ``\\x19``).  A French word list
(``liste_francais.txt`` in the current working directory) is used to build a
lookup table from "broken" spellings back to the correctly accented words.
Every string value in the JSON document is repaired and the file is
rewritten in place.

Usage: python script.py <input_dir>
"""
import argparse  # NOTE: argparse, time and urllib.request are not referenced below.
import json
import os
import re
import sys
import time
import urllib.request
from pathlib import Path

tasks = []  # List of tuples: (filepath_str, label_str); not referenced in this script.
results = {}  # Not referenced in this script.

WORD_LIST_FILE = 'liste_francais.txt'
JSON_FILE_NAME = "correction.json"

# Accented letters that are assumed to have been replaced by a placeholder.
_ACCENT_RE = re.compile(r'[éèêëàâäîïôöùûüçœÉÈÊËÀÂÄÎÏÔÖÙÛÜÇŒ]')
# A candidate "broken" word: ASCII letters mixed with null-byte placeholders.
_BROKEN_WORD_RE = re.compile(r'[a-zA-Z\x00]+')
# \x18 and \x19 are treated as the same placeholder as \x00.
_PLACEHOLDER_TABLE = str.maketrans({'\x18': '\x00', '\x19': '\x00'})

# Broken (lower-cased) spelling -> correctly accented word.  Filled by main().
lookup_map = {}


def load_word_list(path):
    """Return the lines of the word list at *path*.

    The file is decoded as UTF-8 (a leading BOM is ignored).  If the bytes are
    not valid UTF-8 the file is decoded as cp1252 instead, so that a word list
    saved in the legacy Western European encoding still yields real accented
    letters rather than depending on the platform's default encoding.
    """
    raw = Path(path).read_bytes()
    try:
        text = raw.decode("utf-8-sig")
    except UnicodeDecodeError:
        text = raw.decode("cp1252")
    return text.splitlines()


def build_lookup_map(words):
    """Build the ``broken spelling -> accented word`` table from *words*.

    Each accented letter is replaced by ``\\x00`` to simulate the corruption,
    e.g. ``"école"`` is stored under ``"\\x00cole"``.  Keys are lower-cased
    because :func:`fast_fix` looks words up in lower case.  Words without any
    accented letter are skipped.  When two words share the same broken
    spelling, the one appearing later in *words* wins.
    """
    table = {}
    for word in words:
        broken_key = _ACCENT_RE.sub('\x00', word)
        if '\x00' in broken_key:
            table[broken_key.lower()] = word
    return table


def _restore_case(broken_word, fixed_word):
    """Merge the surviving letters of *broken_word* with *fixed_word*.

    Letters that survived the corruption keep the case they have in the input.
    Recovered letters take the dictionary's case, or upper case when every
    surviving letter of the input is upper case.
    """
    if len(broken_word) != len(fixed_word):
        # Cannot align character by character; fall back to the dictionary form.
        return fixed_word
    shout = broken_word.isupper()
    return "".join(
        (good.upper() if shout else good) if bad == '\x00' else bad
        for bad, good in zip(broken_word, fixed_word)
    )


def fast_fix(text, lookup=None):
    """Replace broken words in *text* with their accented form.

    A broken word is a run of ASCII letters and ``\\x00`` placeholders that
    contains at least one placeholder.  Words that are not found in the table
    are left untouched.

    Args:
        text: The string to repair.
        lookup: Optional ``broken spelling -> word`` table with lower-case
            keys.  Defaults to the module-level ``lookup_map``.

    Returns:
        The repaired string.
    """
    if '\x00' not in text:
        return text
    table = lookup_map if lookup is None else lookup

    def replacer(match):
        broken_word = match.group(0)
        if '\x00' not in broken_word:
            return broken_word
        fixed_word = table.get(broken_word.lower())
        if fixed_word is None:
            return broken_word
        return _restore_case(broken_word, fixed_word)

    return _BROKEN_WORD_RE.sub(replacer, text)


def clean_string(s: str) -> str:
    """Repair one string: dictionary fix first, then ftfy, then drop leftovers.

    The dictionary repair must run before ``ftfy.fix_text``: ftfy's default
    ``remove_control_chars`` fixer deletes control characters such as
    ``\\x00``, ``\\x18`` and ``\\x19``, which would erase the placeholders that
    :func:`fast_fix` relies on.
    """
    # Imported here so the pure helpers in this module can be imported
    # without the third-party dependency being installed.
    import ftfy

    s = s.translate(_PLACEHOLDER_TABLE)
    s = fast_fix(s)
    s = ftfy.fix_text(s)
    # Placeholders that could not be resolved are removed.
    return s.replace('\x00', '')


def clean_obj(obj):
    """Recursively apply :func:`clean_string` to every string value in *obj*.

    Lists and dicts are rebuilt; dict keys are left unchanged; any other
    value is returned as is.
    """
    if isinstance(obj, str):
        return clean_string(obj)
    if isinstance(obj, list):
        return [clean_obj(item) for item in obj]
    if isinstance(obj, dict):
        return {key: clean_obj(value) for key, value in obj.items()}
    return obj


def main(argv=None):
    """Clean ``correction.json`` in the directory named by ``argv[1]``.

    Exits with a message when the argument is missing or is not a directory.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        sys.exit("Usage: python script.py <input_dir>")

    input_dir = Path(args[1])
    if not input_dir.is_dir():
        sys.exit(f"Directory {input_dir} not found.")

    lookup_map.update(build_lookup_map(load_word_list(WORD_LIST_FILE)))

    input_file = input_dir / JSON_FILE_NAME
    output_file = input_dir / JSON_FILE_NAME

    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    data = clean_obj(data)

    # Input and output are the same file: write to a sibling temp file and
    # swap it in, so a failure while serialising cannot truncate the original.
    tmp_file = output_file.with_name(output_file.name + ".tmp")
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    os.replace(tmp_file, output_file)

    print("Fixed JSON saved to", output_file)


if __name__ == "__main__":
    main()