miscs

2026-04-23 12:41:50 +02:00 · 2026-04-23 12:41:50 +02:00 · dd0d757fc9
parent bc47f81556
commit dd0d757fc9
4 changed files with 96 additions and 16 deletions
--- a/annotating_by_label.py
+++ b/annotating_by_label.py
@ -12,8 +12,7 @@ import annotating_with_checks

 from utils import natural_key

-# Roughly 10 A4 pages at 100 DPI
-MAX_HEIGHT_PX = 20000 # Can be increased by 10%.
+MAX_HEIGHT_PX = 25000 # Can be increased by 10%.

 def render_item(item):
    student_id, label, content = item
@ -134,15 +133,32 @@ def main():
        shutil.rmtree(bgnot_dir)
    os.makedirs(bgnot_dir, exist_ok=True)

+    used_prefixes = set()
+
+    previous_prefix = None
    for line in lines:
        labels = [l.strip() for l in line.split(',') if l.strip()]
        safe_labels = [l.replace(":", "").strip() for l in line.split(',') if l.strip()]
        if not labels:
            continue

-        prefix = os.path.commonprefix(safe_labels).strip()
-        if not prefix:
-            prefix = "Group"
+        base_prefix = os.path.commonprefix(safe_labels).strip()
+        if not base_prefix:
+            base_prefix = "Group"
+
+        unique_prefix = base_prefix
+        if unique_prefix[-1] == "i":  # drop a single trailing "i" from the shared prefix
+            unique_prefix = unique_prefix[:-1]
+        counter = 2
+        while unique_prefix in used_prefixes:  # on a clash, try "<base_prefix>-2", "-3", ... (built from the untrimmed base_prefix)
+            unique_prefix = f"{base_prefix}-{counter}"
+            counter += 1
+        if counter == 2 and previous_prefix and previous_prefix in unique_prefix:  # counter == 2 means the loop above never ran (no clash)
+            unique_prefix = f"{previous_prefix}-{counter}"  # NOTE(review): this name is not re-checked against used_prefixes - confirm it cannot repeat
+        elif counter == 2:
+            previous_prefx = unique_prefix  # NOTE(review): "previous_prefx" looks like a typo for previous_prefix; no other line in this diff assigns previous_prefix, so the branch above cannot fire - confirm
+
+        used_prefixes.add(unique_prefix)

        items_to_render = []
        for sid, lbls in results.items():
@ -201,7 +217,7 @@ def main():
            batches = batches2

        for i, batch in enumerate(batches, 1):
-            save_batch(batch, prefix, i, root_dir, args.overwrite)
+            save_batch(batch, unique_prefix, i, root_dir, args.overwrite)

 if __name__ == "__main__":
    main()
--- a/correction.py
+++ b/correction.py
@ -20,7 +20,7 @@ parser.add_argument("--refaire", action="store_true",
                    help="Redo specific copies/labels defined in refaire.json")
 parser.add_argument("--batch", action="store_true",
                    help="Generate a JSONL file of requests to send to the Gemini Batch API")
-parser.add_argument("--deal-with-batched", type=str, metavar="FILE",
+parser.add_argument("--deal-with-batched", action="store_true",
                    help="Process a JSONL file containing completed batch results")
 args, _ = parser.parse_known_args()

@ -236,6 +236,44 @@ class EvaluationEntry(BaseModel):
    id: str = Field(description="Entry identifier")
    result: ResultData = Field(description="Result details")

+# The nested model definitions do not work with the batch API, so the response schema (an array of {id, result} entries) is unrolled by hand here.
+UNROLLED_SCHEMA = {
+                    "type": "ARRAY",
+                    "items": {
+                        "type": "OBJECT",
+                        "properties": {
+                            "id": {"type": "STRING", "description": "Entry identifier"},
+                            "result": {
+                                "type": "OBJECT",
+                                "properties": {
+                                    "score": {"type": "NUMBER", "description": "The numeric score"},
+                                    "confidence": {"type": "NUMBER", "description": "Confidence level"},
+                                    "error": {"type": "STRING", "description": "Indicates if an error occurred"},
+                                    "feedback": {
+                                        "type": "ARRAY",
+                                        "description": "List of feedback items",
+                                        "items": {
+                                            "type": "OBJECT",
+                                            "properties": {
+                                                "text": {"type": "STRING", "description": "Feedback content"},
+                                                "box_2d": {
+                                                    "type": "ARRAY",
+                                                    "items": {"type": "INTEGER"},
+                                                    "nullable": True,
+                                                    "description": "box coordinates or null"
+                                                }
+                                            },
+                                            "required": ["text"]
+                                        }
+                                    }
+                                },
+                                "required": ["score", "confidence", "feedback", "error"]
+                            }
+                        },
+                        "required": ["id", "result"]
+                    }
+                }
+
 # The root model for parsing is List[EvaluationEntry].
 def generate_request(file, full_label):
    """Generates request for Gemini."""
@ -780,7 +818,8 @@ if __name__ == "__main__":
                            "topP": 0.95,
                            "maxOutputTokens": 65535,
                            "responseMimeType": "application/json",
-                            "responseSchema": TypeAdapter(List[EvaluationEntry]).json_schema()
+                            "responseSchema": UNROLLED_SCHEMA
+                            # TypeAdapter(List[EvaluationEntry]).json_schema()
                        }
                    }
                }
@ -800,7 +839,7 @@ if __name__ == "__main__":

    batched_responses = {}
    if args.deal_with_batched:
-        batch_results_path = Path(args.deal_with_batched)
+        batch_results_path = Path(INPUT_DIR) / "batched_correction_result.jsonl"
        if batch_results_path.exists():
            print(f"Loading batch results from {batch_results_path}...")
            with open(batch_results_path, "r", encoding="utf-8") as f:
--- a/liste_francais.txt
+++ b/liste_francais.txt
@ -7395,6 +7395,7 @@ ennuis
 ennuyer
 ennuyeux
 énoncer
+énoncé
 énonciation
 énorme
 énormément
--- a/post-correction.py
+++ b/post-correction.py
@ -39,23 +39,47 @@ def fast_fix(text):
        broken_word = match.group(0)
        # Return the fixed word from our map, or leave it if not found
        # (Handles case-insensitivity by falling back to lowercase map)
-        return lookup_map.get(broken_word.lower(), broken_word)
+        fixed = lookup_map.get(broken_word.lower())
+        # if not fixed:
+            # print(f"No match found for: {repr(broken_word)}")
+        return fixed or broken_word

    return re.sub(r'[a-zA-Z\x00]+', replacer, text)
-    return text
+    # return text


 INPUT_FILE = Path(INPUT_DIR) / "correction.json"
 OUTPUT_FILE = Path(INPUT_DIR) / "correction.json"

+def fix_hex_corruption_safe(text):
+    # Replace a NUL followed by two hex digits in the range E0-FF with the
+    # character at that code point (U+00E0-U+00FF); any other NUL is left as is.
+    return re.sub(r'\x00([eEfF][0-9a-fA-F])',
+                  lambda m: chr(int(m.group(1), 16)),
+                  text)
+
+def some_other_replacements(s):  # presumably repairs LaTeX commands whose leading "\n" was decoded as a newline - confirm
+    s = s.replace("\neq", "\\neq")  # newline + "eq" -> literal backslash + "neq"
+    s = s.replace("\not", "\\not")  # newline + "ot" -> literal backslash + "not"; NOTE(review): also rewrites any line that merely starts with "ot"
+    return s
+
+
 def clean_string(s: str) -> str:
     # fix encoding issues
-    s = ftfy.fix_text(s)
+    # s = ftfy.fix_text(s)
+    # print(s)
+    s = fix_hex_corruption_safe(s)  # first turn NUL + hex byte (E0-FF) sequences back into characters
+
     s = s.replace('\x19', '\x00')  # map \x19 to NUL so it is handled like \x00 below
     s = s.replace('\x18', '\x00')  # same for \x18
-    s = fast_fix(s)
-    s = s.replace('\x00', '')
-    return s
+    s = s.replace('\x00\x00', '\x00')  # collapse each NUL pair into a single NUL (one pass)
+
+    s = re.sub(r' \x00{1,2} ', ' à ', s)  # one or two NULs standing alone between spaces become the word "à"
+
+    if '\x00' in s:  # only run the lookup-based repair when NULs remain
+        s = fast_fix(s)
+        s = s.replace('\x00', '')  # drop any NUL that fast_fix left in place
+    return some_other_replacements(s)


 def clean_obj(obj):