Post correction.py

master
Sébastien Miquel 2026-03-15 10:52:44 +01:00
parent 9d51ea8394
commit 5f1613b8c1
1 changed file with 32 additions and 4 deletions

View File

@@ -17,16 +17,44 @@ if not arg_path.exists():
import json
import ftfy
import re
import urllib.request
# Reference word list: one French word per line, UTF-8 encoded.
url = "https://raw.githubusercontent.com/hbenbel/French-Dictionary/master/dictionary/dictionary.txt"
# The response is closed deterministically by the context manager, and the
# timeout keeps a stalled connection from hanging the script forever.
with urllib.request.urlopen(url, timeout=30) as response:
    french_words = response.read().decode('utf-8').splitlines()

# 2. Pre-compute an O(1) lookup dictionary
# We simulate the corruption by replacing accents with null bytes (\x00):
# every accented dictionary word is stored under the key it would have if all
# of its accented letters had been lost, e.g. "\x00cole" -> "école".
# One translation table replaces a regex substitution per word.
_ACCENT_TO_PLACEHOLDER = str.maketrans(
    dict.fromkeys('éèêëàâäîïôöùûüçœÉÈÊËÀÂÄÎÏÔÖÙÛÜÇŒ', '\x00')
)

lookup_map = {}
for word in french_words:
    # Replace all French accents with \x00 to create the "broken" key
    broken_key = word.translate(_ACCENT_TO_PLACEHOLDER)
    # Words without any accent can never be corrupted this way; skip them.
    if '\x00' in broken_key:
        # NOTE: distinct words can share a broken key; the word that appears
        # last in the list wins.
        lookup_map[broken_key] = word
# 3. Fast replace function
# Maximal runs of ASCII letters and placeholder null bytes. Compiled once
# because fast_fix runs on every string that gets cleaned.
_BROKEN_WORD_RE = re.compile(r'[a-zA-Z\x00]+')


def _restore_case(broken_word, fixed_word):
    """Re-apply the capitalisation of *broken_word* to *fixed_word*."""
    letters = broken_word.replace('\x00', '')
    # Several surviving letters, all upper case: the word was in capitals.
    if len(letters) > 1 and letters.isupper():
        return fixed_word.upper()
    # Leading capital only (a leading placeholder carries no case information).
    if broken_word[0].isupper():
        return fixed_word[:1].upper() + fixed_word[1:]
    return fixed_word


def fast_fix(text):
    """Restore accented words whose accents were replaced by null bytes.

    Every run of ASCII letters and ``\\x00`` placeholders that contains at
    least one placeholder is looked up in the module-level ``lookup_map``.
    The exact spelling is tried first; otherwise the lowercase spelling is
    tried and the capitalisation of the broken word is re-applied to the
    result, so "Cr\\x00me" becomes "Crème" rather than "crème".

    Args:
        text: String that may contain ``\\x00`` placeholders.

    Returns:
        ``text`` with every recognised broken word replaced. Words that are
        not found in ``lookup_map`` are left untouched, placeholders included.
    """
    def replacer(match):
        broken_word = match.group(0)
        # Every key of lookup_map contains a placeholder, so a plain word
        # cannot match; skip the dictionary lookups for it.
        if '\x00' not in broken_word:
            return broken_word
        exact = lookup_map.get(broken_word)
        if exact is not None:
            return exact
        lowered = lookup_map.get(broken_word.lower())
        if lowered is None:
            # Unknown word: leave it exactly as found.
            return broken_word
        return _restore_case(broken_word, lowered)

    return _BROKEN_WORD_RE.sub(replacer, text)
# Correction file to read and repaired file to write, both inside INPUT_DIR.
INPUT_FILE = Path(INPUT_DIR) / "correction.json"
OUTPUT_FILE = Path(INPUT_DIR) / "fixed_correction.json"
# Misspelled legacy name, kept only as an alias so that any remaining
# reference to it still resolves; new code should use OUTPUT_FILE.
OUPUT_FILE = OUTPUT_FILE
def clean_string(s: str) -> str:
    """Repair a corrupted string: restore lost accents, then fix encoding.

    The steps run in this order:

    1. Genuine null bytes are dropped.
    2. The control characters ``\\x18`` and ``\\x19`` are turned into the
       ``\\x00`` placeholder and ``fast_fix`` restores the accented words it
       recognises.
    3. Placeholders that could not be resolved are removed.
    4. ``ftfy.fix_text`` repairs the remaining encoding problems.

    The placeholder repair has to run before ftfy: with its default settings
    ``ftfy.fix_text`` removes ASCII control characters (``\\x00``, ``\\x18``
    and ``\\x19`` included), so running it first would erase the markers and
    the dictionary repair would never find anything to fix.

    Args:
        s: Raw string to clean.

    Returns:
        The cleaned string, free of null bytes.
    """
    # Remove real null bytes first so they cannot be mistaken for the
    # placeholder introduced just below.
    s = s.replace("\x00", "")
    # \x18 and \x19 are treated as stand-ins for a lost accented letter.
    s = s.translate({0x18: "\x00", 0x19: "\x00"})
    s = fast_fix(s)
    # Drop the placeholders of words the dictionary could not resolve.
    s = s.replace("\x00", "")
    # fix encoding issues
    s = ftfy.fix_text(s)
    return s