Post correction.py
parent
9d51ea8394
commit
5f1613b8c1
|
|
@ -17,16 +17,44 @@ if not arg_path.exists():
|
|||
|
||||
import json
|
||||
import ftfy
|
||||
import re
|
||||
import urllib.request
|
||||
|
||||
# 1. Download the reference word list and split it into lines
# (presumably one French word per line -- confirm against the repository).
url = "https://raw.githubusercontent.com/hbenbel/French-Dictionary/master/dictionary/dictionary.txt"
# The context manager closes the HTTP response as soon as the body has been
# read, instead of leaving the connection to be reclaimed by the garbage
# collector.
with urllib.request.urlopen(url) as response:
    french_words = response.read().decode('utf-8').splitlines()
|
||||
|
||||
# 2. Build the "corrupted spelling -> correct spelling" table.
# Corruption is simulated by masking every accented French letter listed in
# the pattern below with a NUL character (\x00). Storing the result in a dict
# lets later repairs use a single hash lookup per word.
# If several dictionary words collapse to the same masked key, the word that
# appears last in french_words overwrites the earlier ones.
_accented_letter = re.compile(r'[éèêëàâäîïôöùûüçœÉÈÊËÀÂÄÎÏÔÖÙÛÜÇŒ]')

lookup_map = {}
for word in french_words:
    broken_key = _accented_letter.sub('\x00', word)
    if '\x00' not in broken_key:
        # Nothing was masked, so this word can never need repairing.
        continue
    lookup_map[broken_key] = word  # e.g., "\x00cole" -> "école"
|
||||
|
||||
# 3. Fast replace function
|
||||
def fast_fix(text, lookup: "dict[str, str] | None" = None):
    """Restore accented letters that were replaced by NUL characters.

    Every maximal run of ASCII letters and NUL characters that contains at
    least one NUL is looked up in the repair table. The exact spelling is
    tried first, then its lowercase form. When only the lowercase form is
    known, the casing of the surviving letters is re-applied to the result:
    an all-uppercase token yields an uppercase word, and a token starting
    with an uppercase letter yields a capitalised word. Unknown tokens are
    left unchanged.

    Args:
        text: String that may contain NUL placeholders.
        lookup: Optional mapping from masked spelling to correct spelling.
            Defaults to the module-level ``lookup_map``.

    Returns:
        ``text`` with every recognised masked word replaced.
    """
    # Fast path: nothing to repair, and no need to touch the repair table.
    if '\x00' not in text:
        return text

    table = lookup_map if lookup is None else lookup

    def replacer(match):
        broken_word = match.group(0)
        # The pattern also matches ordinary words; skip them cheaply.
        if '\x00' not in broken_word:
            return broken_word

        fixed = table.get(broken_word)
        if fixed is not None:
            return fixed

        fixed = table.get(broken_word.lower())
        if fixed is None:
            return broken_word

        # Re-apply the casing that lowercasing the key threw away.
        surviving = broken_word.replace('\x00', '')
        if len(surviving) > 1 and surviving.isupper():
            return fixed.upper()
        if broken_word[0].isupper():
            return fixed[:1].upper() + fixed[1:]
        return fixed

    return re.sub(r'[a-zA-Z\x00]+', replacer, text)
|
||||
|
||||
|
||||
|
||||
# Paths inside INPUT_DIR: the file named correction.json and the file named
# fixed_correction.json.
INPUT_FILE = Path(INPUT_DIR).joinpath("correction.json")
OUTPUT_FILE = Path(INPUT_DIR).joinpath("fixed_correction.json")

# Same path as OUTPUT_FILE under a misspelled name.
# NOTE(review): presumably a leftover -- confirm nothing still references it
# before removing.
OUPUT_FILE = Path(INPUT_DIR).joinpath("fixed_correction.json")
|
||||
|
||||
def clean_string(s: str) -> str:
    """Clean *s*: fix encoding issues and repair words with lost accents.

    Steps, in order:

    1. Remove NUL characters already present in the input.
    2. Fix encoding issues with ``ftfy.fix_text``.
    3. Turn the control characters U+0018 and U+0019 into NUL placeholders.
    4. Let ``fast_fix`` replace words containing placeholders with their
       dictionary spelling.
    5. Remove any placeholder that could not be resolved.

    Args:
        s: The raw string to clean.

    Returns:
        The cleaned string, free of NUL characters.
    """
    # remove null bytes
    s = s.replace("\x00", "")
    # fix encoding issues
    s = ftfy.fix_text(s)
    # NOTE(review): ftfy.fix_text may strip control characters by default
    # (its remove_control_chars option); if so, U+0018/U+0019 never reach the
    # next line -- confirm against the ftfy documentation.
    # Plain str.replace is used on purpose: re.sub() rejects r'\x00' as a
    # replacement template with "bad escape \x" on Python 3.7+.
    s = s.replace('\x19', '\x00').replace('\x18', '\x00')
    s = fast_fix(s)
    # Drop placeholders that fast_fix could not map to a known word.
    return s.replace('\x00', '')
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue