# Copies/post-correction.py
#
# 84 lines
# 2.3 KiB
# Python

import sys
import os
import time
from pathlib import Path
import argparse
# Command-line handling: the first positional argument is the directory
# that the rest of the script works in.
if len(sys.argv) < 2:
    sys.exit("Usage: python script.py <InputDir>")

arg_path = Path(sys.argv[1])
tasks = []  # List of tuples: (filepath_str, label_str)
results = {}
INPUT_DIR = str(arg_path)

# is_dir() rather than exists(): a path to a regular file exists but is not
# a usable input directory, and would only fail later with a less helpful
# error when a file inside it is opened.
if not arg_path.is_dir():
    sys.exit(f"Directory {INPUT_DIR} not found.")
import json
import ftfy
import re
import urllib.request
# --- Disabled: dictionary-based accent restoration --------------------------
# 1. Download a French word list
# url = "https://raw.githubusercontent.com/hbenbel/French-Dictionary/master/dictionary/dictionary.txt"
# french_words = urllib.request.urlopen(url).read().decode('utf-8').splitlines()
# 2. Pre-compute an O(1) lookup dictionary
# We simulate the corruption by replacing accents with null bytes (\x00)
# lookup_map = {}
# for word in french_words:
#     # Replace all French accents with \x00 to create the "broken" key
#     broken_key = re.sub(r'[éèêëàâäîïôöùûüçœÉÈÊËÀÂÄÎÏÔÖÙÛÜÇŒ]', '\x00', word)
#     if '\x00' in broken_key:
#         lookup_map[broken_key] = word  # e.g., "\x00cole" -> "école"
# 3. Fast replace function
def fast_fix(text: str) -> str:
    """Return ``text`` unchanged.

    This is currently a pass-through. The dictionary-based repair it was
    written for is disabled; its implementation is kept in the comments
    below for reference.

    Args:
        text: The string to process.

    Returns:
        The same string, unmodified.
    """
    # Disabled implementation: find words made of regular letters and null
    # bytes and swap each for its entry in ``lookup_map``.
    #
    # def replacer(match):
    #     broken_word = match.group(0)
    #     # Return the fixed word from our map, or leave it if not found
    #     # (Handles case-insensitivity by falling back to lowercase map)
    #     return lookup_map.get(broken_word.lower(), broken_word)
    # return re.sub(r'[a-zA-Z\x00]+', replacer, text)
    return text
# Input and output resolve to the same file, so the cleaned JSON is written
# back over the file it was read from.
INPUT_FILE = Path(INPUT_DIR, "correction.json")
OUTPUT_FILE = Path(INPUT_DIR, "correction.json")
# Maps the control characters U+0018 and U+0019 to NUL, so that a single
# marker character reaches fast_fix().
_CONTROL_TO_NUL = str.maketrans({"\x18": "\x00", "\x19": "\x00"})


def clean_string(s: str) -> str:
    """Repair encoding damage in ``s`` and strip leftover marker characters.

    The string goes through four steps in order: ``ftfy.fix_text``, mapping
    of U+0018 / U+0019 to NUL, ``fast_fix``, and finally removal of every
    remaining NUL character.

    Args:
        s: The string to clean.

    Returns:
        The cleaned string.
    """
    # fix encoding issues
    s = ftfy.fix_text(s)
    # NOTE(review): ftfy's default settings may already drop these control
    # characters, which would make the next step a no-op - confirm against
    # the installed ftfy version.
    s = s.translate(_CONTROL_TO_NUL)
    s = fast_fix(s)
    # Whatever fast_fix() did not repair is dropped rather than kept as NUL.
    return s.replace("\x00", "")
def clean_obj(obj):
    """Recursively apply ``clean_string`` to every string value in ``obj``.

    Lists and dicts are rebuilt as new containers; the input is not mutated.
    Any other value (numbers, booleans, ``None``, other types) is returned
    as-is.

    Args:
        obj: A value made of strings, lists, dicts and scalars, such as the
            result of ``json.load``.

    Returns:
        A structure of the same shape with every string value cleaned.
    """
    if isinstance(obj, str):
        return clean_string(obj)
    if isinstance(obj, list):
        return [clean_obj(item) for item in obj]
    if isinstance(obj, dict):
        # Only values are cleaned; keys are carried over untouched.
        return {key: clean_obj(value) for key, value in obj.items()}
    return obj
# Load the JSON document, clean every string in it, and write it back.
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

data = clean_obj(data)

# OUTPUT_FILE may be the very file that was just read. Opening it directly
# with "w" would truncate it before the dump finishes, so a failure or
# interruption would destroy the only copy. Write to a sibling temporary
# file instead and swap it into place once it is complete.
tmp_file = OUTPUT_FILE.with_name(OUTPUT_FILE.name + ".tmp")
try:
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    os.replace(tmp_file, OUTPUT_FILE)
except BaseException:
    # Do not leave a partial temporary file behind; re-raise the error.
    if tmp_file.exists():
        tmp_file.unlink()
    raise

print("Fixed JSON saved to", OUTPUT_FILE)