# Post-change contents of the hunk at line 17 of post-correction.py,
# reformatted from a one-line unified diff into plain Python.
import json
import ftfy
import re
import urllib.request

# Word list used to restore accented letters that were lost in the input.
url = "https://raw.githubusercontent.com/hbenbel/French-Dictionary/master/dictionary/dictionary.txt"

# A single placeholder character stands in for one lost accented letter.
_PLACEHOLDER = "\x00"

# Accented letters that get replaced by the placeholder when building keys.
_ACCENTED_CHAR_RE = re.compile(r'[éèêëàâäîïôöùûüçœÉÈÊËÀÂÄÎÏÔÖÙÛÜÇŒ]')

# A candidate token: a run of ASCII letters and placeholders.
_TOKEN_RE = re.compile(r'[a-zA-Z\x00]+')

# \x18 and \x19 are folded into the placeholder before the repair.
# NOTE(review): they are treated as lost accented letters, as in the original
# change; they are also the low bytes of U+2018/U+2019 (curly quotes), so
# confirm against the real data that this interpretation is correct.
_TO_PLACEHOLDER = str.maketrans({"\x18": _PLACEHOLDER, "\x19": _PLACEHOLDER})


def _download_word_list(source_url, timeout=30):
    """Return the lines of the UTF-8 text document at *source_url*.

    The response is closed deterministically and the request is bounded by
    *timeout* seconds so a stalled connection cannot hang the script.
    Network errors (``urllib.error.URLError``) propagate to the caller.
    """
    with urllib.request.urlopen(source_url, timeout=timeout) as response:
        return response.read().decode('utf-8').splitlines()


def _build_lookup_map(words):
    """Map each "broken" spelling to the accented word it came from.

    The broken spelling is the word with every accented letter replaced by
    the placeholder, e.g. ``"\\x00cole" -> "école"``. Keys are lowercased
    because ``fast_fix`` looks tokens up in lowercase. Words without any
    accented letter are skipped. When several words share a broken spelling,
    the one appearing last in *words* wins.
    """
    mapping = {}
    for word in words:
        broken_key = _ACCENTED_CHAR_RE.sub(_PLACEHOLDER, word)
        if _PLACEHOLDER in broken_key:
            mapping[broken_key.lower()] = word
    return mapping


# Downloaded once at start-up; the lookup table gives O(1) repairs per token.
french_words = _download_word_list(url)
lookup_map = _build_lookup_map(french_words)


def _restore_case(fixed, broken):
    """Apply the letter case of the token *broken* to the dictionary word *fixed*.

    Placeholders carry no case information, so the case is inferred from the
    ASCII letters that survived: an all-uppercase token (with at least two
    letters) yields an uppercase word, a token starting with an uppercase
    letter yields a capitalised word, anything else is returned unchanged.
    """
    surviving_letters = broken.replace(_PLACEHOLDER, '')
    if len(surviving_letters) > 1 and broken.isupper():
        return fixed.upper()
    if broken[0].isupper():
        return fixed[0].upper() + fixed[1:]
    return fixed


def fast_fix(text):
    """Replace tokens containing placeholders with their dictionary spelling.

    Each run of ASCII letters and ``\\x00`` characters is looked up
    (case-insensitively) in ``lookup_map``. Tokens that are found are replaced
    by the accented word, with the original letter case re-applied; tokens
    that are not found are left untouched.
    """
    # Nothing to repair: skip the regex pass entirely.
    if _PLACEHOLDER not in text:
        return text

    def replacer(match):
        broken_word = match.group(0)
        # Every key contains a placeholder, so plain words can never match.
        if _PLACEHOLDER not in broken_word:
            return broken_word
        fixed = lookup_map.get(broken_word.lower())
        if fixed is None:
            return broken_word
        return _restore_case(fixed, broken_word)

    return _TOKEN_RE.sub(replacer, text)


INPUT_FILE = Path(INPUT_DIR) / "correction.json"
OUTPUT_FILE = Path(INPUT_DIR) / "fixed_correction.json"
# Backward-compatible alias for the previous (misspelled) name, in case code
# outside this hunk still refers to it — remove once no caller uses it.
OUPUT_FILE = OUTPUT_FILE


def clean_string(s: str) -> str:
    """Repair lost accented letters in *s*, then fix remaining encoding issues.

    Steps:
      1. Fold ``\\x18`` / ``\\x19`` into the ``\\x00`` placeholder.
      2. Restore accented words from the dictionary (``fast_fix``).
      3. Drop placeholders that could not be resolved.
      4. Let ftfy fix the remaining encoding problems.
    """
    # str.translate is used instead of re.sub(..., r'\x00', ...): "\x" is not
    # a valid escape in a regex replacement template and raises re.error.
    s = s.translate(_TO_PLACEHOLDER)
    # The repair must run before ftfy: with its default settings ftfy strips
    # control characters, which would erase the placeholders it relies on.
    s = fast_fix(s)
    s = s.replace(_PLACEHOLDER, '')
    # fix encoding issues
    s = ftfy.fix_text(s)
    return s