Post correction.py
parent
9d51ea8394
commit
5f1613b8c1
|
|
@ -17,16 +17,44 @@ if not arg_path.exists():
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import ftfy
|
import ftfy
|
||||||
|
import re
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
# 1. Download the French word list used to rebuild corrupted accented words.
url = "https://raw.githubusercontent.com/hbenbel/French-Dictionary/master/dictionary/dictionary.txt"
# The response is closed deterministically by the context manager, and the
# timeout keeps a stalled connection from hanging the script forever.
with urllib.request.urlopen(url, timeout=30) as response:
    french_words = response.read().decode('utf-8').splitlines()
|
||||||
|
|
||||||
|
# 2. Build the broken-word -> correct-word table (constant-time lookups).
# Corruption is simulated by swapping every accented letter for a null
# byte (\x00), which yields the key a damaged word would show up as.
_accented_letter = re.compile(r'[éèêëàâäîïôöùûüçœÉÈÊËÀÂÄÎÏÔÖÙÛÜÇŒ]')
lookup_map = {}
for word in french_words:
    broken_key = _accented_letter.sub('\x00', word)
    if '\x00' not in broken_key:
        # Nothing was replaced, so this word has no corrupted form to index.
        continue
    # When several words collapse to the same key, the last one wins.
    lookup_map[broken_key] = word  # e.g., "\x00cole" -> "école"
|
||||||
|
|
||||||
|
# 3. Fast replace function
|
||||||
|
def fast_fix(text, lookup=None):
    """Restore words whose accented letters were replaced by null bytes.

    Each run of ASCII letters and null-byte placeholders that contains at
    least one placeholder is lower-cased and looked up in ``lookup``. A hit
    is replaced by the stored spelling, re-cased to follow the input
    (all upper-case or leading capital); a miss is left exactly as found.

    Args:
        text: String in which each lost accented letter is a null byte.
        lookup: Mapping from lower-case broken word to its correct spelling.
            Defaults to the module-level ``lookup_map``.

    Returns:
        ``text`` with every recognised broken word repaired.
    """
    if '\x00' not in text:
        # No placeholder anywhere: skip the regex pass entirely.
        return text

    table = lookup_map if lookup is None else lookup

    def replacer(match):
        broken_word = match.group(0)
        if '\x00' not in broken_word:
            # Plain ASCII word; nothing to repair.
            return broken_word
        fixed = table.get(broken_word.lower())
        if fixed is None:
            return broken_word
        # The table is keyed in lower case, so put the original casing back
        # instead of silently lower-casing the whole word.
        if broken_word.isupper():
            return fixed.upper()
        if broken_word[0].isupper():
            return fixed[:1].upper() + fixed[1:]
        return fixed

    return re.sub(r'[a-zA-Z\x00]+', replacer, text)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Source JSON to clean and the destination for the cleaned copy.
INPUT_FILE = Path(INPUT_DIR) / "correction.json"
OUTPUT_FILE = Path(INPUT_DIR) / "fixed_correction.json"
# Former misspelled name, kept so any remaining reference keeps working.
OUPUT_FILE = OUTPUT_FILE
|
||||||
|
|
||||||
|
|
||||||
def clean_string(s: str) -> str:
    """Fix encoding damage in ``s`` and restore accents lost as control bytes.

    The text is first passed through ``ftfy.fix_text``. The control
    characters 0x18 and 0x19 are then turned into null-byte placeholders,
    ``fast_fix`` rebuilds the words it recognises, and any placeholder that
    is still present afterwards is dropped.

    Args:
        s: Raw string to clean.

    Returns:
        The cleaned string, free of null bytes.
    """
    # fix encoding issues
    # NOTE(review): ftfy's default control-character removal may already
    # strip \x00/\x18/\x19 here, leaving nothing for the steps below — confirm.
    s = ftfy.fix_text(s)
    # \x18 and \x19 are treated as lost accented letters: normalise both to
    # the \x00 placeholder that fast_fix() looks for. Plain str.replace is
    # used because re.sub() rejects "\x" in a replacement template
    # (re.error: bad escape \x), which made every call raise.
    s = s.replace('\x19', '\x00').replace('\x18', '\x00')
    s = fast_fix(s)
    # Drop placeholders belonging to words the dictionary could not resolve.
    s = s.replace('\x00', '')
    return s
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue