From 27c0dae20ecdf987617966a585d46ad74f4856e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Miquel?= Date: Sat, 6 Jun 2026 22:09:00 +0200 Subject: [PATCH] Miscs --- Readme.org | 16 +++++- correction.py | 133 ++++++++++++++++++++++++++++++++++++------- gemini_for_enonce.py | 113 ++++++++++++++++++++++++++++-------- gemini_for_labels.py | 16 +++++- page_splitter.py | 84 ++++++++++++++++++++------- plotting.py | 8 +++ prompting.py | 4 ++ rename_to_copie.sh | 5 ++ rotate_all.sh | 5 ++ 9 files changed, 315 insertions(+), 69 deletions(-) diff --git a/Readme.org b/Readme.org index 8bd144a..56c2c6d 100644 --- a/Readme.org +++ b/Readme.org @@ -1,7 +1,7 @@ #+title: Script #+author: Sébastien Miquel #+date: 14-03-2026 -# Time-stamp: <06-06-26 10:10> +# Time-stamp: <06-06-26 16:23> #+OPTIONS: * Méta @@ -85,7 +85,17 @@ export GEMINI_API_KEY=… 6. Suivre les étapes plus bas. * Étapes et Script -** Prétraitement +** Prétraitement de l'énoncé + + - Dans le dossier de l'évaluation, mettre : + + `enonce.pdf` + + `enonce.tex` + + `correction.tex`. + - `python gemini_for_enonce.py Interro` + Se charge de créer des dossiers `Text` et `Sol` avec + + Le fichier `Text` contient + +** Prétraitement des copies 1. =./rotate_all.sh Interro= (facultatif) @@ -248,7 +258,7 @@ OU 4. On peut faire des changements manuels aux =score.json= ici, puis - `python reading_annotations.py --update-score Interro` - `python reading_grouped_annotations.py --update-score Interro` - pour mettre à jour les scores dans les images. + pour mettre à jour les scores dans les images. 4. (gestion perso) + =gestion_classe ne= pour créer l'interro puis + =gestion_classe we= (set barème here) diff --git a/correction.py b/correction.py index 2fa2a78..a602b9b 100644 --- a/correction.py +++ b/correction.py @@ -186,7 +186,7 @@ def call_gemini_with_retries(model_id, contents, config, tprint(f"\tGemini Pro minute limit hit. Waiting {wait_time:.1f}s...") time.sleep(wait_time) continue # Retry same model - + # Immediately fallback to Flash without waiting if it's a Pro quota error if is_quota_error and model_id == MODEL_ID_pro and fallback_model_id: tprint(f"\tGemini Pro quota hit ({e}). \n\n\tFalling back to Flash permanently...") @@ -269,8 +269,9 @@ def handle_label_errors(pid, label, res, pdf_path): new_pdf_path = COPIES_DIR / f"Copie{pid}" / f"{new_label}_new.pdf" if base_new_pdf_path.exists() or new_pdf_path.exists(): - tprint(f"\t\tCopie{pid} tried to move wrong {label} to {new_label}, but it already exists.") - res["error"] = f"wrg-lbl:{new_label}?exists" + tprint(f"\t\tCopie{pid} tried to move wrong {label} to {new_label}, but it already exists. Delaying.") + # res["error"] = f"wrg-lbl:{new_label}?exists" + res["error"] = f"wrg-lbl:{new_label}?delayed" else: res["error"] = f"wrg-lbl-moved-to:{new_label}" tprint(f"\t\tCopie{pid} : moving wrong {label} to {new_label}.") @@ -323,8 +324,9 @@ def handle_label_errors(pid, label, res, pdf_path): keep_error = True else: keep_error = True - error += f"(xx){add_label}" - tprint(f"\t\tAlready present (not copied) Copie{pid} : {label} -> {add_label}") + # error += f"(xx){add_label}" + error += f"(delayed){add_label}" + tprint(f"\t\tAlready present (not copied) Copie{pid} : {label} -> {add_label}. Delaying.") if not keep_error: res["error"] = "" else: @@ -487,6 +489,80 @@ def process_single_task(task_tuple, precomputed_response=None): finally: flush_thread_log() +def resolve_delayed_moves(): + """Scans the current results to find delayed moves and executes them if space was freed.""" + new_tasks = [] + with io_lock: + for label, batches in results.items(): + for batch in batches: + for p in batch: + err = p.get("result", {}).get("error", "") + if not err or ("?delayed" not in err and "(delayed)" not in err): + continue + + pid = p["id"] + pdf_path = COPIES_DIR / f"Copie{pid}" / f"{label}.pdf" + + if not pdf_path.exists(): + if pdf_path.with_name(f"{label}_new.pdf").exists(): + pdf_path = pdf_path.with_name(f"{label}_new.pdf") + elif pdf_path.with_name(f"{label}_old.pdf").exists(): + pdf_path = pdf_path.with_name(f"{label}_old.pdf") + + # 1. Résolution de wrong-label + if err.startswith("wrg-lbl:") and "?delayed" in err: + new_label = err.split(":")[1].split("?")[0] + base_new_pdf_path = COPIES_DIR / f"Copie{pid}" / f"{new_label}.pdf" + new_pdf_path = COPIES_DIR / f"Copie{pid}" / f"{new_label}_new.pdf" + + # Si la place s'est libérée (l'ancien a été bougé vers _old) + if not base_new_pdf_path.exists() and not new_pdf_path.exists(): + tprint(f"Resolving delayed move: Copie{pid} {label} -> {new_label}") + p["result"]["error"] = f"wrg-lbl-moved-to:{new_label}" + p["result"]["suffixe"] = "_old" # Très important pour l'ignorer ensuite + + shutil.copy(str(pdf_path), str(new_pdf_path)) + old_pdf_path = pdf_path.with_name(f"{label}_old.pdf") + if pdf_path != old_pdf_path: + shutil.move(str(pdf_path), str(old_pdf_path)) + + idx = get_next_group_idx(new_label) + height = grouping.get_pdf_height(str(new_pdf_path)) + grouping.create_jpg(new_label, idx, [(pid, str(new_pdf_path), height)], GROUPS_DIR) + new_tasks.append((str(GROUPS_DIR / new_label / f"Group_{idx+1}.jpg"), new_label, False)) + + # 2. Résolution de additional-answer + elif err.startswith("al:") and "(delayed)" in err: + import re + delayed_matches = re.findall(r'\(delayed\)([^?()]+)', err) + new_err = err + resolved_any = False + + for add_label in delayed_matches: + base_add_pdf_path = COPIES_DIR / f"Copie{pid}" / f"{add_label}.pdf" + add_pdf_path = COPIES_DIR / f"Copie{pid}" / f"{add_label}_new.pdf" + + if not base_add_pdf_path.exists() and not add_pdf_path.exists(): + tprint(f"Resolving delayed additional-answer: Copie{pid} {label} -> {add_label}") + new_err = new_err.replace(f"(delayed){add_label}", f"(->){add_label}") + resolved_any = True + + shutil.copy(str(pdf_path), str(add_pdf_path)) + idx = get_next_group_idx(add_label) + height = grouping.get_pdf_height(str(add_pdf_path)) + grouping.create_jpg(add_label, idx, [(pid, str(add_pdf_path), height)], GROUPS_DIR) + new_tasks.append((str(GROUPS_DIR / add_label / f"Group_{idx+1}.jpg"), add_label, False)) + + if resolved_any: + p["result"]["error"] = new_err + + if new_tasks: + # Sauvegarder les modifications d'erreurs (les tags delayed enlevés) + with open(output_path, "w", encoding="utf-8") as f: + json.dump(results, f, indent=2) + + return new_tasks + if __name__ == "__main__": if args.refaire: refaire_path = INPUT_DIR / "refaire.json" @@ -666,24 +742,37 @@ if __name__ == "__main__": else: print(f"Warning: Batch results file {batch_results_path} not found.", file=sys.stderr) - print(f"Starting processing on {len(tasks_to_process)} tasks with {NB_THREADS} threads...") - with concurrent.futures.ThreadPoolExecutor(max_workers=NB_THREADS) as executor: - futures = {} - for task in tasks_to_process: - file_path = task[0] - precomp = batched_responses.get(file_path) - futures[executor.submit(process_single_task, task, precomp)] = task + made_progress = True + while tasks_to_process or made_progress: + if tasks_to_process: + print(f"Starting processing on {len(tasks_to_process)} tasks with {NB_THREADS} threads...") + with concurrent.futures.ThreadPoolExecutor(max_workers=NB_THREADS) as executor: + futures = {} + for task in tasks_to_process: + file_path = task[0] + precomp = batched_responses.get(file_path) + futures[executor.submit(process_single_task, task, precomp)] = task - # Process tasks as they complete, allowing dynamic task addition - for future in concurrent.futures.as_completed(futures): - try: - new_generated_tasks = future.result() - if new_generated_tasks: - for new_task in new_generated_tasks: - # New tasks from wrong-label/additional-answer will fallback to live API - futures[executor.submit(process_single_task, new_task)] = new_task - except Exception as e: - print(f"Exception during task execution: {e}", file=sys.stderr) + for future in concurrent.futures.as_completed(futures): + try: + new_generated_tasks = future.result() + if new_generated_tasks: + for new_task in new_generated_tasks: + futures[executor.submit(process_single_task, new_task)] = new_task + except Exception as e: + print(f"Exception during task execution: {e}", file=sys.stderr) + + tasks_to_process = [] # Vider la liste une fois traitée + + # Après avoir traité toutes les tâches actuelles (live ou batched), + # on tente de débloquer les mouvements qui étaient en attente + delayed_tasks = resolve_delayed_moves() + if delayed_tasks: + print(f"Resolved {len(delayed_tasks)} delayed moves! Running executor for new tasks...") + tasks_to_process.extend(delayed_tasks) + made_progress = True + else: + made_progress = False end_time = time.time() print("Time elapsed : ", end_time - start_time) diff --git a/gemini_for_enonce.py b/gemini_for_enonce.py index a9efb33..4064b21 100644 --- a/gemini_for_enonce.py +++ b/gemini_for_enonce.py @@ -1,4 +1,6 @@ +import shlex import os +import subprocess import sys import argparse from pathlib import Path @@ -7,7 +9,9 @@ from typing import List from google import genai from google.genai import types -MODEL_ID = "gemini-3-flash-preview" +# Bug : l'output est limité à 8k token… +# MODEL_ID = "gemini-3-flash-preview" +MODEL_ID = "gemini-3.1-flash-lite" api_key = os.environ.get("GEMINI_API_KEY") class QuestionItem(BaseModel): @@ -84,14 +88,25 @@ def process_exam(folder_path: str): response_json_schema=ExamExtraction.model_json_schema(), ) - print("Sending request to Gemini...") - response = client.models.generate_content( - model=MODEL_ID, - contents=contents, - config=config - ) + cache_file = folder / "gemini_response.json" - extracted_data = ExamExtraction.model_validate_json(response.text) + if cache_file.is_file(): + print("Loading cached response from gemini_response.json...") + response_text = cache_file.read_text(encoding="utf-8") + else: + print("Sending request to Gemini...") + response = client.models.generate_content( + model=MODEL_ID, + contents=contents, + config=config + ) + response_text = response.text + + print("Saving response to cache...") + cache_file.write_text(response_text, encoding="utf-8") + + # Validate from the text variable (cached or fresh) + extracted_data = ExamExtraction.model_validate_json(response_text) # 2. Setup output directories text_dir = folder / "Text" @@ -101,28 +116,80 @@ def process_exam(folder_path: str): labels_file = folder / "labels" - print("Writing files...") + # Step 1: Write initial labels + print("Writing initial labels file...") with open(labels_file, "w", encoding="utf-8") as flabels: for q in extracted_data.questions: - # Sanitize label for filesystem (prevent directory traversal if label contains '/') - safe_label = q.label.replace("/", "_") + flabels.write(f"{q.label}\n") - flabels.write(f"{safe_label}\n") + # Step 2: Open labels file for user editing + print("Opening labels file for editing...") + editor = os.environ.get("EDITOR") + try: + if editor: + subprocess.run(shlex.split(editor) + [str(labels_file)]) + else: + # Fallbacks if $EDITOR is not set + if sys.platform.startswith("linux"): + subprocess.Popen(["xdg-open", str(labels_file)]) + elif sys.platform == "darwin": + subprocess.Popen(["open", str(labels_file)]) + else: + os.startfile(str(labels_file)) - # Fix double-escaped newlines - q_content = q.question_content.replace("\\n", "\n") - s_content = q.solution_content.replace("\\n", "\n") - + # xdg-open/open usually do not block, so we wait for user confirmation + input("Press ENTER here once you have saved and closed the labels file...") + except Exception: + print("Error running editor, using labels as given.") - # Write Text/label - with open(text_dir / safe_label, "w", encoding="utf-8") as f: - f.write(f"{q.label}\n{q.question_content}") + # Step 3 & 4: Read the edited file back and create a mapping + with open(labels_file, "r", encoding="utf-8") as flabels: + edited_lines = [line.strip() for line in flabels if line.strip()] - # Write Sol/label - with open(sol_dir / safe_label, "w", encoding="utf-8") as f: - f.write(f"{q.label}\n{q.solution_content}") + mapping = [] + final_labels = [] + orig_idx = 0 - print(f"Success! Processed {len(extracted_data.questions)} questions.") + for line in edited_lines: + if line.startswith("+"): + new_label = line[1:].lstrip() + final_labels.append(new_label) + # New label, no source content + mapping.append((new_label, None)) + else: + new_label = line + final_labels.append(new_label) + # Map to initial order, advancing index only for non-'+' items + q_item = extracted_data.questions[orig_idx] if orig_idx < len(extracted_data.questions) else None + mapping.append((new_label, q_item)) + orig_idx += 1 + + # Rewrite the labels file cleanly (removing '+' prefixes) + with open(labels_file, "w", encoding="utf-8") as flabels: + for lbl in final_labels: + flabels.write(f"{lbl}\n") + + # Step 5: Write the final question and solution files + print("Writing question and solution files...") + for new_label, q_item in mapping: + safe_label = new_label.replace("/", "_") + + if q_item: + q_content = q_item.question_content.replace("\\n", "\n") + s_content = q_item.solution_content.replace("\\n", "\n") + else: + q_content = "" + s_content = "" + + # Write Text/label + with open(text_dir / safe_label, "w", encoding="utf-8") as f: + f.write(f"{new_label}\n{q_content}") + + # Write Sol/label + with open(sol_dir / safe_label, "w", encoding="utf-8") as f: + f.write(f"{new_label}\n{s_content}") + + print(f"Success! Processed {len(mapping)} labels.") if __name__ == "__main__": if not api_key: diff --git a/gemini_for_labels.py b/gemini_for_labels.py index 402d7bf..ca1adf8 100644 --- a/gemini_for_labels.py +++ b/gemini_for_labels.py @@ -61,6 +61,8 @@ be missing. ##labels## +##wrong_labels## + Here's a list of the names of the students, pick the one that matches the best or `\"Unknown\"` if you cannot read the name @@ -116,6 +118,8 @@ be missing. ##labels## +##wrong_labels## + Since this copy isn't the first part of a sequence, simply set the name to `\"Continued\"`.""" @@ -128,7 +132,7 @@ class AnnotationData(BaseModel): list: List[BoxItem] = Field(description="List of bounding box items") -def generate_request(file, labels, names, context_labels): +def generate_request(file, labels, names, context_labels, wrong_labels): """Generates request for Gemini with context.""" image_path = Path(file) @@ -142,6 +146,11 @@ def generate_request(file, labels, names, context_labels): else: text = my_prompt2.replace("##labels##", labels)\ .replace("##prev_context##", context_str) + if wrong_labels: + text= text.replace("##wrong_labels##\n\n", f"On a previous request, you answered with the following wrong labels : {wrong_labels}. These are wrong, since they do not exactly match any of the labels in the previous list.") + else: + text = text.replace("##wrong_labels##\n\n", "") + contents = [ types.Content( @@ -271,12 +280,14 @@ def process_copy_group(group_key, files): print(f"[{group_key}] Processing {image_file.name} with {len(accumulated_labels)} accumulated labels...") attempt = -1 + wrong_labels = [] while True: attempt += 1 if attempt > 0: time.sleep(10 * attempt) try: - contents, config = generate_request(image_file, labels_txt, names_txt, accumulated_labels) + contents, config = generate_request(image_file, labels_txt, names_txt, accumulated_labels, + wrong_labels) response = client.models.generate_content( model=MODEL_ID, @@ -289,6 +300,7 @@ def process_copy_group(group_key, files): name = annota.name if unknown: print(f"Error: {image_file.name} contained unknown labels: {unknown}") + wrong_labels.extend(unknown) print("Retrying request...") continue # Retry immediately diff --git a/page_splitter.py b/page_splitter.py index f0bbd39..63c9084 100644 --- a/page_splitter.py +++ b/page_splitter.py @@ -15,7 +15,8 @@ from pypdf import PdfReader, PdfWriter CM_TO_POINTS = (1 / 2.54) * 72 def list_pdf_files(directory): - return list(reversed(sorted(glob.glob(os.path.join(directory, "*.pdf"))))) + l = list(reversed(sorted(glob.glob(os.path.join(directory, "*.pdf"))))) + return [u for u in l if "enonce" not in u] class PDFPreviewer: @@ -84,7 +85,10 @@ class PDFPreviewer: self.num = 0 self.global_rotation = 0 # Rotation appliquée à tous les fichiers self.history = [] - self.setup_next_file() + if not self.setup_next_file(): + print(f"Aucun fichier PDF valide trouvé dans : {path}") + master.destroy() + return self._resize_job = None # For debouncing resize events @@ -462,30 +466,72 @@ class PDFPreviewer: ri = 0 i = 0 while i < len(ps): - # Si c'est une copie double - if (ps[i]['keep'] == "both" or ps[i]['keep'] == "right") \ - and i < len(ps)-1 and (ps[i+1]['keep'] != "right"): - shutil.copy2(self.split_filename_right(i), self.reorder_filename(ri)) - ri += 1 - if ps[i+1]['keep'] != "none": + psk = ps[i]['keep'] + + # Si c'est une copie double (on s'assure qu'on a bien 2 pages consécutives modifiables) + if psk in ["both", "right", "left", "none"] and i < len(ps)-1 and ps[i+1]['keep'] in ["both", "right", "left", "none"]: + + # 1. Page de garde (Extérieur Droit) + if ps[i]['keep'] in ["both", "right"]: + shutil.copy2(self.split_filename_right(i), self.reorder_filename(ri)) + ri += 1 + + # 2. Intérieur Gauche + if ps[i+1]['keep'] in ["both", "left"]: shutil.copy2(self.split_filename_left(i+1), self.reorder_filename(ri)) ri += 1 - if ps[i+1]['keep'] != "left": - shutil.copy2(self.split_filename_right(i+1), self.reorder_filename(ri)) - ri += 1 - if ps[i]['keep'] == "both": - shutil.copy2(self.split_filename_left(i), self.reorder_filename(ri)) - ri += 1 - i += 2 - else: - psk = ps[i]['keep'] - if psk == "left" or psk == "both" or psk == "as_is": + + # 3. Intérieur Droit + if ps[i+1]['keep'] in ["both", "right"]: + shutil.copy2(self.split_filename_right(i+1), self.reorder_filename(ri)) + ri += 1 + + # 4. Dos de la copie (Extérieur Gauche) + if ps[i]['keep'] in ["both", "left"]: shutil.copy2(self.split_filename_left(i), self.reorder_filename(ri)) ri += 1 - if psk == "right" or psk == "both": + + i += 2 + else: + # Si c'est une page simple (ou as_is) + if psk in ["left", "both", "as_is"]: + shutil.copy2(self.split_filename_left(i), self.reorder_filename(ri)) + ri += 1 + if psk in ["right", "both"]: shutil.copy2(self.split_filename_right(i), self.reorder_filename(ri)) ri += 1 i += 1 + # def reorder_pdfs(self): + # """Reordonne les pages, si ce sont des copies doubles.""" + # self.clean_up_dir(self.reorder_dir) + # ps = self.page_settings + # ri = 0 + # i = 0 + # while i < len(ps): + # # Si c'est une copie double + # if (ps[i]['keep'] == "both" or ps[i]['keep'] == "right") \ + # and i < len(ps)-1 and (ps[i+1]['keep'] != "right"): + # shutil.copy2(self.split_filename_right(i), self.reorder_filename(ri)) + # ri += 1 + # if ps[i+1]['keep'] != "none": + # shutil.copy2(self.split_filename_left(i+1), self.reorder_filename(ri)) + # ri += 1 + # if ps[i+1]['keep'] != "left": + # shutil.copy2(self.split_filename_right(i+1), self.reorder_filename(ri)) + # ri += 1 + # if ps[i]['keep'] == "both": + # shutil.copy2(self.split_filename_left(i), self.reorder_filename(ri)) + # ri += 1 + # i += 2 + # else: + # psk = ps[i]['keep'] + # if psk == "left" or psk == "both" or psk == "as_is": + # shutil.copy2(self.split_filename_left(i), self.reorder_filename(ri)) + # ri += 1 + # if psk == "right" or psk == "both": + # shutil.copy2(self.split_filename_right(i), self.reorder_filename(ri)) + # ri += 1 + # i += 1 def concate_files(self): writer = PdfWriter() diff --git a/plotting.py b/plotting.py index 09064dc..5685b06 100644 --- a/plotting.py +++ b/plotting.py @@ -157,6 +157,14 @@ class ImageViewer: self.root = root self.root.resizable(False, False) # If you resize, coordinates will be wrong + screen_w = root.winfo_screenwidth() + screen_h = root.winfo_screenheight() + + x = int(screen_w * 0.1) + y = int(screen_h * 0.05) + + root.geometry(f"+{x}+{y}") + self.base_dir = base_dir self.root.title("Bounding Box Viewer") self.label = tk.Label(root, text="Waiting for images...") diff --git a/prompting.py b/prompting.py index a2cf779..1c6beb4 100644 --- a/prompting.py +++ b/prompting.py @@ -88,6 +88,10 @@ do not score or give feedback to any other question.""" def make_prompt(input_dir,full_label): def read_longest_prefix_file(subdir): dir_path = input_dir / subdir + if not dir_path.exists(): + if subdir != "Persp": + print("Warning !! Directory doesn't exist : ", dir_path) + return "" matches = [f for f in dir_path.iterdir() if f.is_file() and full_label.startswith(f.name) diff --git a/rename_to_copie.sh b/rename_to_copie.sh index 4251c72..a4a0ef8 100755 --- a/rename_to_copie.sh +++ b/rename_to_copie.sh @@ -14,6 +14,11 @@ for file in *.pdf; do # Handle case where no pdfs exist [ -e "$file" ] || continue + if [ "$file" = "enonce.pdf" ]; then + echo "Skipping: $file" + continue + fi + # Rename with 0-padding (e.g., Copie01.pdf) mv -- "$file" "$(printf "Copie%02d.pdf" "$count")" ((count++)) diff --git a/rotate_all.sh b/rotate_all.sh index 0e1c92a..56e61c9 100755 --- a/rotate_all.sh +++ b/rotate_all.sh @@ -13,6 +13,11 @@ cd "$1" || { echo "Error: Cannot access directory '$1'"; exit 1; } shopt -s nullglob for file in *.pdf; do + if [ "$file" = "enonce.pdf" ]; then + echo "Skipping: $file" + continue + fi + # Rotate to a temporary file if qpdf --rotate=+180 "$file" "temp_rotated.pdf"; then mv "temp_rotated.pdf" "$file"