diff --git a/annotating_by_label.py b/annotating_by_label.py index 53168dc..9771416 100644 --- a/annotating_by_label.py +++ b/annotating_by_label.py @@ -12,8 +12,7 @@ import annotating_with_checks from utils import natural_key -# Roughly 10 A4 pages at 100 DPI -MAX_HEIGHT_PX = 20000 # Can be increased by 10%. +MAX_HEIGHT_PX = 25000 # Can be increased by 10%. def render_item(item): student_id, label, content = item @@ -134,15 +133,32 @@ def main(): shutil.rmtree(bgnot_dir) os.makedirs(bgnot_dir, exist_ok=True) + used_prefixes = set() + + previous_prefix = None for line in lines: labels = [l.strip() for l in line.split(',') if l.strip()] safe_labels = [l.replace(":", "").strip() for l in line.split(',') if l.strip()] if not labels: continue - prefix = os.path.commonprefix(safe_labels).strip() - if not prefix: - prefix = "Group" + base_prefix = os.path.commonprefix(safe_labels).strip() + if not base_prefix: + base_prefix = "Group" + + unique_prefix = base_prefix + if unique_prefix[-1] == "i": + unique_prefix = unique_prefix[:-1] + counter = 2 + while unique_prefix in used_prefixes: + unique_prefix = f"{base_prefix}-{counter}" + counter += 1 + if counter == 2 and previous_prefix and previous_prefix in unique_prefix: + unique_prefix = f"{previous_prefix}-{counter}" + elif counter == 2: + previous_prefx = unique_prefix + + used_prefixes.add(unique_prefix) items_to_render = [] for sid, lbls in results.items(): @@ -201,7 +217,7 @@ def main(): batches = batches2 for i, batch in enumerate(batches, 1): - save_batch(batch, prefix, i, root_dir, args.overwrite) + save_batch(batch, unique_prefix, i, root_dir, args.overwrite) if __name__ == "__main__": main() diff --git a/correction.py b/correction.py index f17286f..deaf78c 100644 --- a/correction.py +++ b/correction.py @@ -20,7 +20,7 @@ parser.add_argument("--refaire", action="store_true", help="Redo specific copies/labels defined in refaire.json") parser.add_argument("--batch", action="store_true", help="Generate a JSONL file of requests to send to the Gemini Batch API") -parser.add_argument("--deal-with-batched", type=str, metavar="FILE", +parser.add_argument("--deal-with-batched", action="store_true", help="Process a JSONL file containing completed batch results") args, _ = parser.parse_known_args() @@ -236,6 +236,44 @@ class EvaluationEntry(BaseModel): id: str = Field(description="Entry identifier") result: ResultData = Field(description="Result details") +# These nested definitions do not work with the batch api, unroll them +UNROLLED_SCHEMA = { + "type": "ARRAY", + "items": { + "type": "OBJECT", + "properties": { + "id": {"type": "STRING", "description": "Entry identifier"}, + "result": { + "type": "OBJECT", + "properties": { + "score": {"type": "NUMBER", "description": "The numeric score"}, + "confidence": {"type": "NUMBER", "description": "Confidence level"}, + "error": {"type": "STRING", "description": "Indicates if an error occurred"}, + "feedback": { + "type": "ARRAY", + "description": "List of feedback items", + "items": { + "type": "OBJECT", + "properties": { + "text": {"type": "STRING", "description": "Feedback content"}, + "box_2d": { + "type": "ARRAY", + "items": {"type": "INTEGER"}, + "nullable": True, + "description": "box coordinates or null" + } + }, + "required": ["text"] + } + } + }, + "required": ["score", "confidence", "feedback", "error"] + } + }, + "required": ["id", "result"] + } + } + # The root model for parsing is be: List[EvaluationEntry] def generate_request(file, full_label): """Generates request for Gemini.""" @@ -780,7 +818,8 @@ if __name__ == "__main__": "topP": 0.95, "maxOutputTokens": 65535, "responseMimeType": "application/json", - "responseSchema": TypeAdapter(List[EvaluationEntry]).json_schema() + "responseSchema": UNROLLED_SCHEMA + # TypeAdapter(List[EvaluationEntry]).json_schema() } } } @@ -800,7 +839,7 @@ if __name__ == "__main__": batched_responses = {} if args.deal_with_batched: - batch_results_path = Path(args.deal_with_batched) + batch_results_path = Path(INPUT_DIR) / "batched_correction_result.jsonl" if batch_results_path.exists(): print(f"Loading batch results from {batch_results_path}...") with open(batch_results_path, "r", encoding="utf-8") as f: diff --git a/liste_francais.txt b/liste_francais.txt index e0bb3e7..4804146 100644 --- a/liste_francais.txt +++ b/liste_francais.txt @@ -7395,6 +7395,7 @@ ennuis ennuyer ennuyeux énoncer +énoncé énonciation énorme énormément @@ -21071,4 +21072,4 @@ zone zones zoologie zozoter -zygote \ No newline at end of file +zygote diff --git a/post-correction.py b/post-correction.py index c9aa59c..0b89e23 100644 --- a/post-correction.py +++ b/post-correction.py @@ -39,23 +39,47 @@ def fast_fix(text): broken_word = match.group(0) # Return the fixed word from our map, or leave it if not found # (Handles case-insensitivity by falling back to lowercase map) - return lookup_map.get(broken_word.lower(), broken_word) + fixed = lookup_map.get(broken_word.lower()) + # if not fixed: + # print(f"No match found for: {repr(broken_word)}") + return fixed or broken_word return re.sub(r'[a-zA-Z\x00]+', replacer, text) - return text + # return text INPUT_FILE = Path(INPUT_DIR) / "correction.json" OUTPUT_FILE = Path(INPUT_DIR) / "correction.json" +def fix_hex_corruption_safe(text): + # Only matches \x00 followed by hex if it results in an accented character + # or common Latin-1 symbols + return re.sub(r'\x00([eEfF][0-9a-fA-F])', + lambda m: chr(int(m.group(1), 16)), + text) + +def some_other_replacements(s): + s = s.replace("\neq", "\\neq") + s = s.replace("\not", "\\not") + return s + + def clean_string(s: str) -> str: # fix encoding issues - s = ftfy.fix_text(s) + # s = ftfy.fix_text(s) + # print(s) + s = fix_hex_corruption_safe(s) + s = s.replace('\x19', '\x00') s = s.replace('\x18', '\x00') - s = fast_fix(s) - s = s.replace('\x00', '') - return s + s = s.replace('\x00\x00', '\x00') + + s = re.sub(r' \x00{1,2} ', ' à ', s) + + if '\x00' in s: + s = fast_fix(s) + s = s.replace('\x00', '') + return some_other_replacements(s) def clean_obj(obj):