"""Repair encoding damage and LaTeX escaping in ``<directory>/correction.json``.

Usage: python script.py <directory>

Every string found in the JSON document is passed through ``clean_string``:
NUL placeholders standing in for lost accented characters are restored with
the help of a French word list (``liste_francais.txt``, read from the current
working directory), a couple of LaTeX commands damaged by JSON escape
handling are repaired, and underscores outside LaTeX math are escaped.
The file is rewritten in place.
"""
import sys
import os
import time
from pathlib import Path
import argparse
import json
import re
import urllib.request

try:
    import ftfy  # noqa: F401  (only needed if a generic ftfy.fix_text pass is re-enabled)
except ImportError:
    # No live code path uses ftfy, so a missing install must not abort the run.
    ftfy = None


WORD_LIST_FILE = 'liste_francais.txt'
JSON_NAME = "correction.json"

# LaTeX math blocks, longest delimiter first so "$$" is not read as two "$".
_MATH_PATTERN = re.compile(
    r'(\$\$.*?\$\$|'    # $$...$$
    r'\$.*?\$|'         # $...$
    r'\\\(.*?\\\)|'     # \( ... \)
    r'\\\[.*?\\\])',    # \[ ... \]
    re.DOTALL,
)

# An underscore preceded by an even number of backslashes (zero included) is
# not escaped yet; an odd number means the last backslash already escapes it.
_UNESCAPED_UNDERSCORE = re.compile(r'(?<!\\)((?:\\\\)*)_')

# Accented characters whose loss is simulated when building the lookup keys.
_ACCENT_PATTERN = re.compile(r'[éèêëàâäîïôöùûüçœÉÈÊËÀÂÄÎÏÔÖÙÛÜÇŒ]')

# Runs of ASCII letters and NUL placeholders, i.e. candidate damaged words.
_BROKEN_WORD_PATTERN = re.compile(r'[a-zA-Z\x00]+')

# Damaged spelling (accents replaced by "\x00") -> correct word,
# e.g. "\x00cole" -> "école".  Filled by main() before any cleaning happens.
lookup_map = {}


def escape_latex_underscores(text):
    r"""Escape '_' outside LaTeX math environments.

    Recognised math environments (left untouched):
    - $...$
    - $$...$$
    - \( ... \)
    - \[ ... \]

    Underscores that are already escaped (``\_``) are left alone, so applying
    the function to its own output does not add further backslashes.

    Args:
        text: The string to process.

    Returns:
        ``text`` with every unescaped underscore outside math written ``\_``.
    """
    def _escape(chunk):
        return _UNESCAPED_UNDERSCORE.sub(lambda m: m.group(1) + r'\_', chunk)

    parts = []
    last_end = 0
    for match in _MATH_PATTERN.finditer(text):
        start, end = match.span()
        parts.append(_escape(text[last_end:start]))  # text before the math block
        parts.append(match.group(0))                 # math block, unchanged
        last_end = end
    parts.append(_escape(text[last_end:]))           # tail after the last block
    return ''.join(parts)


def load_word_list(path=WORD_LIST_FILE):
    """Return the lines of the word list at ``path``.

    UTF-8 is tried first so the result does not depend on the platform
    locale; if the file is not valid UTF-8 the locale's default encoding is
    used instead.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return f.read().splitlines()
    except UnicodeDecodeError:
        with open(path, 'r') as f:
            return f.read().splitlines()


def build_lookup_map(words):
    """Map each damaged spelling of an accented word to the correct word.

    The damage is simulated by replacing every accented character with a NUL
    byte.  Words without accents produce no entry.  When several words share
    the same damaged spelling, the last one in ``words`` wins.
    """
    mapping = {}
    for word in words:
        broken_key = _ACCENT_PATTERN.sub('\x00', word)
        if '\x00' in broken_key:
            mapping[broken_key] = word  # e.g. "\x00cole" -> "école"
    return mapping


def fast_fix(text):
    """Restore accented words whose accents were replaced by NUL bytes.

    Each run of ASCII letters and NULs is looked up in ``lookup_map``, first
    as written and then lowercased.  When only the lowercase form is known,
    the capitalisation of the input (all caps, or leading capital) is applied
    to the result.  Runs with no match are returned unchanged.
    """
    def replacer(match):
        broken_word = match.group(0)
        if '\x00' not in broken_word:
            return broken_word  # every key contains a NUL, so this cannot match
        fixed = lookup_map.get(broken_word)
        if fixed:
            return fixed
        fixed = lookup_map.get(broken_word.lower())
        if not fixed:
            return broken_word
        if broken_word.isupper():
            return fixed.upper()
        if broken_word[0].isupper():
            return fixed[0].upper() + fixed[1:]
        return fixed

    return _BROKEN_WORD_PATTERN.sub(replacer, text)


def fix_hex_corruption_safe(text):
    """Replace a NUL followed by two hex digits in E0-FF with that character.

    Only codes whose first digit is ``e`` or ``f`` are converted, which is the
    range holding the lowercase accented Latin-1 letters.
    """
    # NOTE(review): a NUL placeholder followed by ordinary letters that happen
    # to look like hex (e.g. "pr\x00face") is converted too -- confirm that
    # both kinds of damage never occur in the same input.
    return re.sub(r'\x00([eEfF][0-9a-fA-F])', lambda m: chr(int(m.group(1), 16)), text)


def some_other_replacements(s):
    """Repair ``\\neq`` / ``\\not`` whose leading ``\\n`` became a newline."""
    # The search strings are deliberately NOT raw: "\neq" is newline + "eq".
    s = s.replace("\neq", "\\neq")
    s = s.replace("\not", "\\not")
    return s


def clean_string(s: str) -> str:
    """Apply every repair step to one string and return the result.

    The order matters: hex-coded characters are decoded before the other
    control characters are turned into NUL placeholders, and placeholders are
    resolved before the leftovers are dropped.
    """
    # A generic ftfy.fix_text(s) pass is not applied; the targeted repairs
    # below are used instead.
    s = fix_hex_corruption_safe(s)
    # \x19 and \x18 are treated as the same "lost accent" marker as NUL.
    s = s.replace('\x19', '\x00')
    s = s.replace('\x18', '\x00')
    s = s.replace('\x00\x00', '\x00')
    # A placeholder standing alone between spaces is taken to be the word "à".
    s = re.sub(r' \x00{1,2} ', ' à ', s)
    if '\x00' in s:
        s = fast_fix(s)
        s = s.replace('\x00', '')  # drop placeholders that could not be resolved
    s = some_other_replacements(s)
    return escape_latex_underscores(s)


def clean_obj(obj):
    """Recursively clean every string value inside a decoded JSON structure.

    Lists and dict values are traversed; dict keys and non-string scalars are
    returned unchanged.
    """
    if isinstance(obj, str):
        return clean_string(obj)
    if isinstance(obj, list):
        return [clean_obj(x) for x in obj]
    if isinstance(obj, dict):
        return {k: clean_obj(v) for k, v in obj.items()}
    return obj


def main():
    """Clean ``correction.json`` inside the directory given on the command line."""
    if len(sys.argv) < 2:
        sys.exit("Usage: python script.py <directory>")

    arg_path = Path(sys.argv[1])
    INPUT_DIR = str(arg_path)
    if not arg_path.exists():
        sys.exit(f"Directory {INPUT_DIR} not found.")

    lookup_map.clear()
    lookup_map.update(build_lookup_map(load_word_list()))

    # Input and output are the same file: the document is fixed in place.
    INPUT_FILE = Path(INPUT_DIR) / JSON_NAME
    OUTPUT_FILE = Path(INPUT_DIR) / JSON_NAME

    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        data = json.load(f)

    data = clean_obj(data)

    # Serialise and encode before opening the output, so a failure cannot
    # leave the (identical) input file truncated.
    payload = json.dumps(data, ensure_ascii=False, indent=2)
    payload.encode("utf-8")

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        f.write(payload)

    print("Fixed JSON saved to", OUTPUT_FILE)


if __name__ == "__main__":
    main()