# Copies/post-correction.py

import sys
import os
import time
from pathlib import Path
import argparse

# The input directory must be supplied as the first command-line argument.
if not sys.argv[1:]:
    sys.exit("Usage: python script.py <InputDir>")

def escape_latex_underscores(text):
    r"""
    Escape '_' outside LaTeX math environments.

    Supported math delimiters (their content is copied through unchanged):
      - $...$
      - $$...$$
      - \( ... \)
      - \[ ... \]

    An underscore that is already escaped, i.e. preceded by an odd number
    of backslashes, is left alone. This makes the function idempotent, so
    processing the same text twice does not turn '\_' into '\\_'.

    Args:
        text: The string to process.

    Returns:
        A copy of ``text`` in which every unescaped '_' outside a math
        block is replaced by '\_'.
    """

    # Regex matching LaTeX math blocks
    math_pattern = re.compile(
        r'(\$\$.*?\$\$|'      # $$...$$
        r'\$.*?\$|'           # $...$
        r'\\\(.*?\\\)|'       # \( ... \)
        r'\\\[.*?\\\])',      # \[ ... \]
        re.DOTALL
    )

    # An underscore preceded by an even number of backslashes (zero
    # included) is unescaped: pairs of backslashes are literal '\\' and do
    # not escape what follows. The pairs are captured so they can be put
    # back in front of the escaped underscore.
    bare_underscore = re.compile(r'(?<!\\)((?:\\\\)*)_')

    def escape_outside(segment):
        # Escape only the underscores that are not escaped yet.
        return bare_underscore.sub(lambda m: m.group(1) + r'\_', segment)

    parts = []
    last_end = 0

    for match in math_pattern.finditer(text):
        start, end = match.span()

        # Text between the previous math block and this one
        parts.append(escape_outside(text[last_end:start]))

        # Keep math block unchanged
        parts.append(match.group(0))

        last_end = end

    # Remaining text after last math block
    parts.append(escape_outside(text[last_end:]))

    return ''.join(parts)

# First command-line argument: the directory to process.
arg_path = Path(sys.argv[1])
tasks = []  # List of tuples: (filepath_str, label_str)
results = {}

INPUT_DIR = str(arg_path)
# is_dir() instead of exists(): a regular file at this path would pass
# exists(), yet the path is used as a directory afterwards.
if not arg_path.is_dir():
    sys.exit(f"Directory {INPUT_DIR} not found.")

import json
import ftfy
import re
import urllib.request

# 1. Load the reference word list, one word per line.
# The encoding is passed explicitly because the list contains accented
# words and the platform default encoding is not UTF-8 everywhere.
# NOTE(review): assumes liste_francais.txt is UTF-8 encoded - confirm.
with open('liste_francais.txt', 'r', encoding='utf-8') as f:
    french_words = f.read().splitlines()

# 2. Pre-compute an O(1) lookup dictionary
# We simulate the corruption by replacing accents with null bytes (\x00)
_accent_pattern = re.compile(r'[éèêëàâäîïôöùûüçœÉÈÊËÀÂÄÎÏÔÖÙÛÜÇŒ]')
lookup_map = {}
for word in french_words:
    # Replace all French accents with \x00 to create the "broken" key
    broken_key = _accent_pattern.sub('\x00', word)
    # Words without any accent can never be corrupted this way: skip them.
    if '\x00' in broken_key:
        # When several words share a key, the last one in the list wins.
        lookup_map[broken_key] = word  # e.g., "\x00cole" -> "école"

# 3. Fast replace function
def fast_fix(text, word_map=None):
    """Repair words whose accented letters were replaced by NUL characters.

    Every run of ASCII letters and NULs is lowercased and looked up in the
    word map. A hit replaces the run; a miss leaves it untouched.

    The lookup is case-insensitive, so the casing of the corrupted word is
    re-applied to the replacement: "Probl\\x00me" yields "Problème", not
    "problème". The case of a letter hidden behind a NUL is unknown; it is
    uppercased only when every visible letter of the word is uppercase.

    Args:
        text: The string to repair.
        word_map: Optional mapping from corrupted key to repaired word.
            Defaults to the module-level ``lookup_map``.

    Returns:
        ``text`` with every recognised corrupted word replaced.
    """
    if word_map is None:
        word_map = lookup_map

    def restore_case(broken, fixed):
        # Positions can only be compared when both words line up.
        if len(broken) != len(fixed):
            return fixed
        visible = [c for c in broken if c != '\x00']
        # Fully uppercase word: the hidden letters are uppercase as well.
        if len(visible) > 1 and all(c.isupper() for c in visible):
            return fixed.upper()
        # Otherwise copy the case position by position.
        return ''.join(
            good.upper() if bad.isupper() else good
            for bad, good in zip(broken, fixed)
        )

    # Find words containing regular letters and null bytes
    def replacer(match):
        broken_word = match.group(0)
        fixed = word_map.get(broken_word.lower())
        # Leave the word as it is when the map has no entry for it.
        if not fixed:
            return broken_word
        return restore_case(broken_word, fixed)

    return re.sub(r'[a-zA-Z\x00]+', replacer, text)


# Input and output point at the same file: the JSON is rewritten in place.
INPUT_FILE = Path(INPUT_DIR, "correction.json")
OUTPUT_FILE = Path(INPUT_DIR, "correction.json")

def fix_hex_corruption_safe(text):
    """Decode a NUL followed by two hex digits into the matching character.

    Only pairs whose first digit is 'e' or 'f' (either case) are decoded,
    i.e. code points 0xE0 to 0xFF. Any other NUL is left in place.

    NOTE(review): a lone NUL that happens to be followed by such a pair of
    letters (for instance "fa" or "fe") is decoded as well - confirm this
    cannot occur in the input data.

    Args:
        text: The string to repair.

    Returns:
        ``text`` with every matching three-character sequence replaced by
        a single character.
    """
    def decode(hit):
        # The captured group holds the two hex digits of the code point.
        return chr(int(hit.group(1), 16))

    corrupted = re.compile(r'\x00([eEfF][0-9a-fA-F])')
    return corrupted.sub(decode, text)

def some_other_replacements(s):
    r"""Put the backslash back in front of 'neq' and 'not'.

    A line feed followed by "eq" becomes the text '\neq', and a line feed
    followed by "ot" becomes the text '\not'. Presumably these are LaTeX
    commands whose leading '\n' was decoded as a line feed.

    Args:
        s: The string to repair.

    Returns:
        ``s`` with both substitutions applied.
    """
    substitutions = (
        ("\neq", "\\neq"),
        ("\not", "\\not"),
    )
    for damaged, restored in substitutions:
        s = s.replace(damaged, restored)
    return s


def clean_string(s: str) -> str:
    """Run the full repair pipeline on one string.

    Steps, in order:
      1. decode NUL + hex-pair sequences (fix_hex_corruption_safe);
      2. turn the control characters 0x18 and 0x19 into NUL and collapse
         each pair of NULs into one;
      3. replace one or two NULs standing alone between spaces by "à";
      4. if NULs remain, repair known words (fast_fix) and drop whatever
         NUL is still left;
      5. apply some_other_replacements;
      6. escape underscores outside math (escape_latex_underscores).

    Args:
        s: The string to clean.

    Returns:
        The cleaned string.
    """
    text = fix_hex_corruption_safe(s)

    # Normalise every placeholder character to NUL.
    text = text.translate({0x19: '\x00', 0x18: '\x00'})
    text = text.replace('\x00\x00', '\x00')

    # A placeholder standing alone between two spaces is read as "à".
    text = re.sub(r' \x00{1,2} ', ' à ', text)

    if '\x00' in text:
        # Repair what the word map knows, then drop the leftovers.
        text = fast_fix(text).replace('\x00', '')

    return escape_latex_underscores(some_other_replacements(text))


def clean_obj(obj):
    """Recursively clean every string value inside a nested structure.

    Dicts and lists are rebuilt with their values cleaned (dict keys are
    kept as they are), strings go through clean_string, and any other
    value is returned unchanged.

    Args:
        obj: A str, list, dict or any other value.

    Returns:
        The cleaned counterpart of ``obj``; containers are new objects.
    """
    if isinstance(obj, dict):
        return {key: clean_obj(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [clean_obj(item) for item in obj]
    if isinstance(obj, str):
        return clean_string(obj)
    return obj


# Load the JSON document and clean every string value it contains.
data = json.loads(INPUT_FILE.read_text(encoding="utf-8"))
data = clean_obj(data)

# Write the result as readable UTF-8: accents kept as-is, 2-space indent.
with OUTPUT_FILE.open("w", encoding="utf-8") as out:
    json.dump(data, out, ensure_ascii=False, indent=2)

print("Fixed JSON saved to", OUTPUT_FILE)