108 lines
2.8 KiB
Python
108 lines
2.8 KiB
Python
import sys
|
|
import os
|
|
import time
|
|
from pathlib import Path
|
|
import argparse
|
|
|
|
# The input directory is the single required command-line argument.
if len(sys.argv) < 2:
    sys.exit("Usage: python script.py <InputDir>")

arg_path = Path(sys.argv[1])
tasks = []  # List of tuples: (filepath_str, label_str)
results = {}

INPUT_DIR = str(arg_path)

# is_dir() instead of exists(): a path that points at a regular file is not a
# usable input directory, so reject it here with the clear message below.
if not arg_path.is_dir():
    sys.exit(f"Directory {INPUT_DIR} not found.")
|
|
|
|
import json
|
|
import ftfy
|
|
import re
|
|
import urllib.request
|
|
|
|
# 1. Load the reference word list, one word per line.
# NOTE(review): no encoding is passed to open(), so the platform default is
# used — confirm the word list's actual encoding and pin it explicitly.
with open('liste_francais.txt', 'r') as f:
    french_words = f.read().splitlines()

# 2. Pre-compute an O(1) lookup dictionary.
# Each accented letter is replaced by a null byte (\x00) to reproduce the
# corrupted spelling; that corrupted spelling becomes the key and the correct
# word the value, e.g. "\x00cole" -> "école".
_accent_re = re.compile(r'[éèêëàâäîïôöùûüçœÉÈÊËÀÂÄÎÏÔÖÙÛÜÇŒ]')
lookup_map = {}
for word in french_words:
    broken_key = _accent_re.sub('\x00', word)
    if '\x00' not in broken_key:
        # The word has no accented letter, so it can never need repairing.
        continue
    lookup_map[broken_key] = word
|
|
|
|
# 3. Fast replace function
|
|
def fast_fix(text, lookup=None):
    """Restore accented words whose accented letters were replaced by NUL.

    Every run of ASCII letters and NUL characters in *text* is looked up in a
    table mapping the corrupted spelling to the correct word. The exact
    spelling is tried first, then its lower-cased form. When only the
    lower-cased form matches, the casing of the corrupted word is re-applied:
    an all-capitals word is upper-cased, and a word starting with a capital
    letter gets its first letter capitalised.

    Args:
        text: The string to repair.
        lookup: Optional mapping of corrupted spelling to correct word.
            Defaults to the module-level ``lookup_map``.

    Returns:
        *text* with every recognised corrupted word replaced. Words that
        contain no NUL, or that are not in the table, are left unchanged.
    """
    table = lookup_map if lookup is None else lookup

    def restore(match):
        broken_word = match.group(0)
        # Plain ASCII words are not corrupted; skip the lookups entirely.
        if '\x00' not in broken_word:
            return broken_word

        exact = table.get(broken_word)
        if exact:
            return exact

        fixed = table.get(broken_word.lower())
        if not fixed:
            return broken_word

        # The table entry is lower-case; re-apply the casing of the input.
        letters = broken_word.replace('\x00', '')
        if len(letters) > 1 and letters.isupper():
            return fixed.upper()
        if broken_word[0].isupper():
            return fixed[0].upper() + fixed[1:]
        return fixed

    return re.sub(r'[a-zA-Z\x00]+', restore, text)
|
|
|
|
|
|
# Input and output are the same path: the cleaned JSON is written back over
# the file it was read from.
INPUT_FILE = OUTPUT_FILE = Path(INPUT_DIR).joinpath("correction.json")
|
|
|
|
def fix_hex_corruption_safe(text):
    """Decode NUL-prefixed hex pairs into the character they encode.

    Only a NUL followed by two hex digits whose first digit is E or F
    (code points 0xE0-0xFF) is rewritten. Any other NUL is left in place.

    Args:
        text: The string to repair.

    Returns:
        *text* with each matching three-character sequence replaced by the
        single character at that code point.
    """
    def decode(match):
        code_point = int(match.group(1), 16)
        return chr(code_point)

    hex_escape = re.compile(r'\x00([eEfF][0-9a-fA-F])')
    return hex_escape.sub(decode, text)
|
|
|
|
def some_other_replacements(s):
    r"""Turn a newline followed by "eq" or "ot" back into ``\neq`` / ``\not``.

    The search strings below start with a real newline character (the
    ``\n`` escape), and the replacements start with a literal backslash
    followed by ``n``.

    Args:
        s: The string to repair.

    Returns:
        *s* with both substitutions applied, in the order listed.
    """
    repairs = (
        ("\neq", "\\neq"),
        ("\not", "\\not"),
    )
    for damaged, restored in repairs:
        s = s.replace(damaged, restored)
    return s
|
|
|
|
|
|
def clean_string(s: str) -> str:
    """Run the full repair pipeline on one string.

    Steps, in order:
      1. Decode NUL-prefixed hex pairs (``fix_hex_corruption_safe``).
      2. Map the control characters U+0019 and U+0018 to NUL, then collapse
         each pair of adjacent NULs into one (a single pass).
      3. Replace one or two NULs standing alone between two spaces with "à".
      4. If any NUL remains, repair whole words with ``fast_fix`` and then
         drop whatever NULs are still left.
      5. Apply ``some_other_replacements``.

    Args:
        s: The string to clean.

    Returns:
        The cleaned string.
    """
    s = fix_hex_corruption_safe(s)

    # These two control characters are treated as the same placeholder as NUL.
    for placeholder in ('\x19', '\x18'):
        s = s.replace(placeholder, '\x00')
    s = s.replace('\x00\x00', '\x00')

    s = re.sub(r' \x00{1,2} ', ' à ', s)

    if '\x00' in s:
        # Dictionary repair first; any NUL it could not resolve is removed.
        s = fast_fix(s).replace('\x00', '')

    return some_other_replacements(s)
|
|
|
|
|
|
def clean_obj(obj):
    """Recursively clean every string value inside a decoded JSON structure.

    Args:
        obj: A str, list, dict, or any other value.

    Returns:
        A cleaned str for str input; a new list or dict with cleaned
        elements/values for list or dict input (dict keys are kept as they
        are); any other value is returned unchanged.
    """
    if isinstance(obj, dict):
        return {key: clean_obj(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [clean_obj(item) for item in obj]
    if isinstance(obj, str):
        return clean_string(obj)
    return obj
|
|
|
|
|
|
# Load the JSON document, clean every string in it, and write it back.
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

data = clean_obj(data)

# OUTPUT_FILE is the same path as INPUT_FILE, and opening it with "w" would
# truncate the source before the new content is fully written. Write to a
# sibling temporary file and swap it in with os.replace, so an interrupted or
# failed write leaves the original file intact.
tmp_file = OUTPUT_FILE.with_name(OUTPUT_FILE.name + ".tmp")
try:
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    os.replace(tmp_file, OUTPUT_FILE)
finally:
    # After a successful replace the temporary file no longer exists; this
    # only removes a partial file left behind by a failure.
    if tmp_file.exists():
        tmp_file.unlink()

print("Fixed JSON saved to", OUTPUT_FILE)
|