miscs

2026-04-23 12:41:50 +02:00 · 2026-04-23 12:41:50 +02:00 · dd0d757fc9
parent bc47f81556
commit dd0d757fc9
4 changed files with 96 additions and 16 deletions
--- a/annotating_by_label.py
+++ b/annotating_by_label.py
@ -12,8 +12,7 @@ import annotating_with_checks
 from utils import natural_key
-# Roughly 10 A4 pages at 100 DPI
+MAX_HEIGHT_PX = 25000 # Can be increased by 10%.
 MAX_HEIGHT_PX = 20000 # Can be increased by 10%.
 def render_item(item):
    student_id, label, content = item
@ -134,15 +133,32 @@ def main():
        shutil.rmtree(bgnot_dir)
    os.makedirs(bgnot_dir, exist_ok=True)
    used_prefixes = set()
    previous_prefix = None
    for line in lines:
        labels = [l.strip() for l in line.split(',') if l.strip()]
        safe_labels = [l.replace(":", "").strip() for l in line.split(',') if l.strip()]
        if not labels:
            continue
-        prefix = os.path.commonprefix(safe_labels).strip()
+        base_prefix = os.path.commonprefix(safe_labels).strip()
-        if not prefix:
+        if not base_prefix:
-            prefix = "Group"
+            base_prefix = "Group"
        unique_prefix = base_prefix
        if unique_prefix[-1] == "i":
            unique_prefix = unique_prefix[:-1]
        counter = 2
        while unique_prefix in used_prefixes:
            unique_prefix = f"{base_prefix}-{counter}"
            counter += 1
        if counter == 2 and previous_prefix and previous_prefix in unique_prefix:
            unique_prefix = f"{previous_prefix}-{counter}"
        elif counter == 2:
            previous_prefx = unique_prefix
        used_prefixes.add(unique_prefix)
        items_to_render = []
        for sid, lbls in results.items():
@ -201,7 +217,7 @@ def main():
            batches = batches2
        for i, batch in enumerate(batches, 1):
-            save_batch(batch, prefix, i, root_dir, args.overwrite)
+            save_batch(batch, unique_prefix, i, root_dir, args.overwrite)
 if __name__ == "__main__":
    main()
--- a/correction.py
+++ b/correction.py
@ -20,7 +20,7 @@ parser.add_argument("--refaire", action="store_true",
                    help="Redo specific copies/labels defined in refaire.json")
 parser.add_argument("--batch", action="store_true",
                    help="Generate a JSONL file of requests to send to the Gemini Batch API")
-parser.add_argument("--deal-with-batched", type=str, metavar="FILE",
+parser.add_argument("--deal-with-batched", action="store_true",
                    help="Process a JSONL file containing completed batch results")
 args, _ = parser.parse_known_args()
@ -236,6 +236,44 @@ class EvaluationEntry(BaseModel):
    id: str = Field(description="Entry identifier")
    result: ResultData = Field(description="Result details")
 # The nested model definitions above do not work with the batch API, so the schema is unrolled by hand here.
 UNROLLED_SCHEMA = {
    # Hand-expanded response schema: an array of {id, result} objects written
    # out inline, with no references to nested model definitions.
    "type": "ARRAY",
    "items": {
        "type": "OBJECT",
        "properties": {
            "id": {"type": "STRING", "description": "Entry identifier"},
            "result": {
                "type": "OBJECT",
                "properties": {
                    "score": {
                        "type": "NUMBER",
                        "description": "The numeric score",
                    },
                    "confidence": {
                        "type": "NUMBER",
                        "description": "Confidence level",
                    },
                    "error": {
                        "type": "STRING",
                        "description": "Indicates if an error occurred",
                    },
                    "feedback": {
                        "type": "ARRAY",
                        "description": "List of feedback items",
                        "items": {
                            "type": "OBJECT",
                            "properties": {
                                "text": {
                                    "type": "STRING",
                                    "description": "Feedback content",
                                },
                                # box_2d is the only field that is both
                                # nullable and absent from a "required" list.
                                "box_2d": {
                                    "type": "ARRAY",
                                    "items": {"type": "INTEGER"},
                                    "nullable": True,
                                    "description": "box coordinates or null",
                                },
                            },
                            "required": ["text"],
                        },
                    },
                },
                "required": ["score", "confidence", "feedback", "error"],
            },
        },
        "required": ["id", "result"],
    },
 }
 # The root model for parsing is List[EvaluationEntry].
 def generate_request(file, full_label):
    """Generates request for Gemini."""
@ -780,7 +818,8 @@ if __name__ == "__main__":
                            "topP": 0.95,
                            "maxOutputTokens": 65535,
                            "responseMimeType": "application/json",
-                            "responseSchema": TypeAdapter(List[EvaluationEntry]).json_schema()
+                            "responseSchema": UNROLLED_SCHEMA
                            # TypeAdapter(List[EvaluationEntry]).json_schema()
                        }
                    }
                }
@ -800,7 +839,7 @@ if __name__ == "__main__":
    batched_responses = {}
    if args.deal_with_batched:
-        batch_results_path = Path(args.deal_with_batched)
+        batch_results_path = Path(INPUT_DIR) / "batched_correction_result.jsonl"
        if batch_results_path.exists():
            print(f"Loading batch results from {batch_results_path}...")
            with open(batch_results_path, "r", encoding="utf-8") as f:
--- a/liste_francais.txt
+++ b/liste_francais.txt
@ -7395,6 +7395,7 @@ ennuis
 ennuyer
 ennuyeux
 énoncer
 énoncé
 énonciation
 énorme
 énormément
@ -21071,4 +21072,4 @@ zone
 zones
 zoologie
 zozoter
-zygote
+zygote
--- a/post-correction.py
+++ b/post-correction.py
@ -39,23 +39,47 @@ def fast_fix(text):
        broken_word = match.group(0)
        # Return the fixed word from our map, or leave it if not found
        # (Handles case-insensitivity by falling back to lowercase map)
-        return lookup_map.get(broken_word.lower(), broken_word)
+        fixed = lookup_map.get(broken_word.lower())
        # if not fixed:
            # print(f"No match found for: {repr(broken_word)}")
        return fixed or broken_word
    return re.sub(r'[a-zA-Z\x00]+', replacer, text)
-    return text
+    # return text
 # Both names resolve to the same correction.json under INPUT_DIR, so anything
 # written to OUTPUT_FILE replaces the input file.
 # NOTE(review): presumably an intentional in-place cleanup — confirm.
 INPUT_FILE = Path(INPUT_DIR) / "correction.json"
 OUTPUT_FILE = Path(INPUT_DIR) / "correction.json"
 def fix_hex_corruption_safe(text):
    """Replace each NUL byte followed by two hex digits with the character they encode.

    The first hex digit must be e, E, f or F, so every substitution produces a
    code point between U+00E0 and U+00FF. A NUL byte followed by anything else
    is left untouched.
    """
    def decode(match):
        # Group 1 holds the two hex digits that followed the NUL byte.
        hex_digits = match.group(1)
        return chr(int(hex_digits, 16))

    corrupted = re.compile(r'\x00([eEfF][0-9a-fA-F])')
    return corrupted.sub(decode, text)
 def some_other_replacements(s):
    """Rewrite newline+'eq' and newline+'ot' as backslash-'neq' and backslash-'not'.

    The search strings are ordinary (non-raw) literals, so each one starts
    with a real newline character rather than a backslash.
    NOTE(review): presumably this restores LaTeX commands whose leading
    backslash-n was decoded as a newline upstream — confirm.
    """
    # Applied in this order; the output of the first rule contains no newline,
    # so the second rule cannot re-match it.
    rules = (
        ("\neq", "\\neq"),
        ("\not", "\\not"),
    )
    for broken, repaired in rules:
        s = s.replace(broken, repaired)
    return s
 def clean_string(s: str) -> str:
    # Clean one string: decode NUL+hex sequences, map the \x19 and \x18
    # placeholders to NUL, turn a short NUL run between spaces into " à ",
    # run fast_fix() only if NUL bytes remain, then apply
    # some_other_replacements() to the result.
    # NOTE(review): this span is a diff hunk — lines starting with "-" are the
    # removed version; lines starting with "+" or unprefixed are the new one.
    # fix encoding issues
-    s = ftfy.fix_text(s)
+    # s = ftfy.fix_text(s)
    # print(s)
    s = fix_hex_corruption_safe(s)
    # Treat \x19 and \x18 as the same placeholder as NUL from here on.
    s = s.replace('\x19', '\x00')
    s = s.replace('\x18', '\x00')
-    s = fast_fix(s)
+    s = s.replace('\x00\x00', '\x00')
-    s = s.replace('\x00', '')
+
-    return s
+    s = re.sub(r' \x00{1,2} ', ' à ', s)
    # Only call the lookup-based fast_fix() when NUL bytes are still present;
    # any NUL it leaves behind is dropped.
    if '\x00' in s:
        s = fast_fix(s)
        s = s.replace('\x00', '')
    return some_other_replacements(s)
 def clean_obj(obj):