diff --git a/Readme.org b/Readme.org index 5e4f02a..3b87382 100644 --- a/Readme.org +++ b/Readme.org @@ -1,7 +1,7 @@ #+title: Script #+author: Sébastien Miquel #+date: 14-03-2026 -# Time-stamp: <14-05-26 08:55> +# Time-stamp: <17-05-26 10:51> #+OPTIONS: * Méta @@ -101,13 +101,13 @@ export GEMINI_API_KEY=… Les key bindings ne sont pas adaptés à un clavier azerty… À changer… - Fix issues with =python page_splitter.py Interro14/Copie01.pdf= + Fix issues with =python page_splitter.py Interro14/Copies/Copie01.pdf= 4. =python cutleft.py Interro= Découpe la partie gauche des copies, là où il devrait y avoir les labels des exercices/questions. - Rerun on a single file with =python cutleft.py Interro/Copie01.pdf= + Rerun on a single file with =python cutleft.py Interro/Copies/Copie01.pdf= ** Génération d'information sur l'énoncé @@ -136,7 +136,7 @@ Set proxy with ~export HTTPS_PROXY="http://10.0.0.1:3128"~ + `|…` n'est pas arrêté verticalement par son type opposé. + `…|` est stoppé horizontalement par le `|…` le plus proche. Pour modifier une seule copie : - =python plotting.py Interro/Copie01.pdf= + =python plotting.py Interro/Copies/Copie01.pdf= It also generates les =Copie01.json=, à partir des =Copie01_01.json= 1. En cas de soucis, (par exemple les pages ne sont pas dans le bon ordre) @@ -147,6 +147,7 @@ Set proxy with ~export HTTPS_PROXY="http://10.0.0.1:3128"~ 3. =python splitting_int.py Interro= Découpe les copies suivant les exercices + Peut-être appelé avec une seule copie. 4. =python grouping.py Interro= Regroupe les mêmes questions de différentes copies en groupes de @@ -159,7 +160,7 @@ Set proxy with ~export HTTPS_PROXY="http://10.0.0.1:3128"~ 1. Il faut créer des persp, pour indication de comment corriger, et relancer =enonce_info.py= 2. =python correction.py Interro --limit 240= OU - =python correction.py Interro/Ex\ 2/Group_1.jpg= OU + =python correction.py Interro/Par\ label/Ex\ 2/Group_1.jpg= OU =python correction.py Interro --overwrite= =python correction.py Interro --pro-by-label= (needs `labels_for_pro`) diff --git a/annotating.py b/annotating.py index bb0f74a..4693b4a 100644 --- a/annotating.py +++ b/annotating.py @@ -37,7 +37,7 @@ def make_dictionary(root_dir, refaire=False, refaire_list=[]): # Find coordinates coordinates = None height,width= None, None - label_dir = os.path.join(root_dir, label) + label_dir = Path(root_dir) / "Par label" / label # Search all json files in Dir/label json_files = glob.glob(os.path.join(label_dir, "*.json")) @@ -59,7 +59,7 @@ def make_dictionary(root_dir, refaire=False, refaire_list=[]): break # Construct PDF path: Dir/Copie{id}/{label}.pdf - pdf_path = os.path.join(root_dir, f"Copie{student_id}", f"{label}.pdf") + pdf_path = Path(root_dir) / "Copies" / f"Copie{student_id}" / f"{label}.pdf" # Initialize dictionary structure for this ID if missing if student_id not in result_data: @@ -89,8 +89,7 @@ def make_dictionary(root_dir, refaire=False, refaire_list=[]): # On ajoute des dummies if labels_to_redo: # Si la liste est non vide for lbl in labels_to_redo: - pdf_path = os.path.join(root_dir, - f"Copie{sid}", f"{lbl}.pdf") + pdf_path = Path(root_dir) / "Copies" / f"Copie{sid}" / f"{lbl}.pdf" if not Path(pdf_path).exists(): print("Debug : asked to refaire", sid, lbl, "but pdf absent") continue @@ -107,8 +106,7 @@ def make_dictionary(root_dir, refaire=False, refaire_list=[]): else: # Ce student id n'a jamais été corrigé result_data[sid] = {} for lbl in labels_to_redo: - pdf_path = os.path.join(root_dir, - f"Copie{sid}", f"{lbl}.pdf") + pdf_path = Path(root_dir) / "Copies" / f"Copie{sid}" / f"{lbl}.pdf" if not pdf_path.exists(): print("Debug : asked to refaire", sid, lbl, "but pdf absent") continue @@ -567,13 +565,13 @@ def process_student(student_id, labels_data, root_dir, all_labels, overwrite): d_notes = dict.fromkeys(all_labels, "") label_images = [] + # !! Trier par l'ordre des labels plutôt sorted_labels = sorted(list(labels_data.items()), key=natural_key) for label, content in sorted_labels: # 1. Find PDF path copie_folder = f"Copie{student_id}" - pdf_rel_path = os.path.join(copie_folder, f"{label}.pdf") - pdf_full_path = os.path.join(root_dir, pdf_rel_path) + pdf_full_path = Path(root_dir) / "Copies" / copie_folder / f"{label}.pdf" if not os.path.exists(pdf_full_path): print(f"File not found: {pdf_full_path}") @@ -629,13 +627,14 @@ def process_correction(root_dir, data, all_labels, overwrite=False): # # Wait for all threads to complete # concurrent.futures.wait(futures) - # Ne pas thread cette applications + # Ne pas thread cette application # 1. Il faut protéger les appels à matplotlib # 2. tu vas perdre les erreurs for student_id, labels in sorted(data.items()): process_student(student_id, labels, root_dir, all_labels, overwrite) import argparse +import utils if __name__ == "__main__": parser = argparse.ArgumentParser(description="Annotate copies") @@ -644,7 +643,7 @@ if __name__ == "__main__": args = parser.parse_args() root_dir = args.root_dir - labels = list(filter(None, (Path(root_dir) / "labels").read_text().splitlines())) + labels = utils.read_all_labels(root_dir) results = make_dictionary(root_dir) # Results is : Copie id -> label -> {pdf_path, gemini_result, coordinates} # Coordinates are the real coordinates (hmin, hmax) of the image in the Group diff --git a/correction.py b/correction.py index e5d8aa1..e99c79e 100644 --- a/correction.py +++ b/correction.py @@ -38,14 +38,20 @@ for path_str in args.paths: # Handle individual file # Note: assumes structure InterroTest/Ex 2/Group_1.jpg to get parents[1] label = arg_path.parent.name + INPUT_DIR = arg_path.parent.parent.parent + COPIES_DIR = INPUT_DIR / "Copies" + GROUPS_DIR = INPUT_DIR / "Par label" tasks.append((str(arg_path), label)) if label not in results: results[label] = [] elif arg_path.is_dir(): + INPUT_DIR = arg_path + COPIES_DIR = INPUT_DIR / "Copies" + GROUPS_DIR = INPUT_DIR / "Par label" # Handle directory (original behavior) - for sub in arg_path.iterdir(): - if sub.is_dir() and sub.name.startswith("Ex"): + for sub in GROUPS_DIR.iterdir(): + if sub.is_dir(): label = sub.name if label not in results: results[label] = [] @@ -145,7 +151,7 @@ do not score or give feedback to any other question.""" def make_prompt(full_label): def read_longest_prefix_file(subdir): - dir_path = Path(INPUT_DIR) / subdir + dir_path = INPUT_DIR / subdir matches = [f for f in dir_path.iterdir() if f.is_file() and full_label.startswith(f.name) @@ -167,7 +173,6 @@ from google.genai import types import base64 import shlex import json -from pathlib import Path import os import threading import concurrent.futures @@ -210,7 +215,7 @@ def flush_thread_log(tid=None): tid = tid or threading.current_thread().name with log_lock: if thread_logs.get(tid): - with open(Path(INPUT_DIR) / "correction_log", "a", encoding="utf-8") as f: + with open(INPUT_DIR / "correction_log", "a", encoding="utf-8") as f: f.write(f"--- Task Log [{tid}] ---\n") f.write("\n".join(thread_logs[tid]) + "\n\n") thread_logs[tid].clear() @@ -311,8 +316,8 @@ def generate_request(file, full_label): return (contents, generate_content_config) client = genai.Client(api_key=api_key) -output_path = Path(INPUT_DIR) / "correction.json" -progress_path = Path(INPUT_DIR) / "correction_progress.json" +output_path = INPUT_DIR / "correction.json" +progress_path = INPUT_DIR / "correction_progress.json" start_time = time.time() overwrite = args.overwrite limit = args.limit @@ -407,9 +412,9 @@ def get_single_image_bytes(pdf_path): return img_byte_arr.getvalue() def correct_boxes_with_gemini(pid, label, original_feedbacks, - root_dir, yming, ymaxg, width_r, total_height): + yming, ymaxg, width_r, total_height): """Requests corrected bounding boxes from Gemini Flash on the single image.""" - pdf_path = Path(root_dir) / f"Copie{pid}" / f"{label}.pdf" + pdf_path = COPIES_DIR / f"Copie{pid}" / f"{label}.pdf" img_bytes = get_single_image_bytes(pdf_path) localized_feedbacks = [f for f in original_feedbacks if f["box_2d"]] @@ -473,9 +478,9 @@ it goes wrong, and the feedback is what went wrong. import shutil import grouping -def get_next_group_idx(root_dir, label): +def get_next_group_idx(label): """Finds the next available Group index for a given label.""" - target_folder = Path(root_dir) / label + target_folder = GROUPS_DIR / label target_folder.mkdir(exist_ok=True) existing = list(target_folder.glob("Group_*.jpg")) if not existing: return 0 @@ -489,7 +494,7 @@ def handle_label_errors(pid, label, res, pdf_path): error_type = res.get("error") all_labels = read_all_labels(INPUT_DIR) - labels_txt = (Path(INPUT_DIR) / "labels").read_text(encoding="utf-8", errors="replace") + labels_txt = (INPUT_DIR / "labels").read_text(encoding="utf-8", errors="replace") enonce = enonce_total(INPUT_DIR) if error_type == "wrong-label": @@ -523,7 +528,7 @@ Here is a list of all possible labels. You need to answer with one of these : if new_label == label: res["error"] = "" return [] - new_pdf_path = Path(INPUT_DIR) / f"Copie{pid}" / f"{new_label}.pdf" + new_pdf_path = COPIES_DIR / f"Copie{pid}" / f"{new_label}.pdf" if new_pdf_path.exists(): tprint(f"\t\tCopie{pid} tried to move wrong {label} to {new_label}, but it already exists.") res["error"] = f"wrg-lbl:{new_label}?exists" @@ -533,12 +538,12 @@ Here is a list of all possible labels. You need to answer with one of these : shutil.move(str(pdf_path), str(new_pdf_path)) # Since we moved the file, this Copie/label should not be taken # into account in the future, I think - idx = get_next_group_idx(INPUT_DIR, new_label) + idx = get_next_group_idx(new_label) height = grouping.get_pdf_height(str(new_pdf_path)) grouping.create_jpg(new_label, idx, [(pid, str(new_pdf_path), height)], - INPUT_DIR) + GROUPS_DIR) tprint(f"\t\tMaking {new_label} group {idx+1}") - new_tasks.append((str(Path(INPUT_DIR) / new_label / f"Group_{idx+1}.jpg"), + new_tasks.append((str(GROUPS_DIR / new_label / f"Group_{idx+1}.jpg"), new_label, False)) elif error_type == "additional-answer": @@ -580,15 +585,15 @@ Here is a list of all possible labels. You need to answer with a list one of the error += f"{add_label}??" keep_error = True continue - new_pdf_path = Path(INPUT_DIR) / f"Copie{pid}" / f"{add_label}.pdf" + new_pdf_path = COPIES_DIR / f"Copie{pid}" / f"{add_label}.pdf" if not new_pdf_path.exists(): shutil.copy(str(pdf_path), str(new_pdf_path)) tprint(f"\t\tCopying Copie{pid} : {label} -> {add_label}") - idx = get_next_group_idx(INPUT_DIR, add_label) + idx = get_next_group_idx(add_label) tprint(f"\t\tMaking {add_label} group {idx+1}") height = grouping.get_pdf_height(str(new_pdf_path)) - grouping.create_jpg(add_label, idx, [(pid, str(new_pdf_path), height)], INPUT_DIR) - new_tasks.append((str(Path(INPUT_DIR) / add_label / f"Group_{idx+1}.jpg"), + grouping.create_jpg(add_label, idx, [(pid, str(new_pdf_path), height)], GROUPS_DIR) + new_tasks.append((str(GROUPS_DIR / add_label / f"Group_{idx+1}.jpg"), add_label, False)) error += f"(->){add_label}" keep_error = True @@ -657,7 +662,7 @@ def process_single_task(task_tuple, precomputed_response=None): res = p["result"] yming, ymaxg, width_r = d_data[pid] - pdf_path = Path(INPUT_DIR) / f"Copie{pid}" / f"{label}.pdf" + pdf_path = COPIES_DIR / f"Copie{pid}" / f"{label}.pdf" if (not can_spawn_tasks) and res["error"] == "additional-answer": tprint("\tSwallowing an additional-answer from a subsequent task.") res["error"]= "" @@ -680,17 +685,22 @@ def process_single_task(task_tuple, precomputed_response=None): pid, label, group_name) continue - if (ymin < yming - 50 or - ymax > ymaxg + 50 or - xmax / 1000 > width_r): + if (ymin < yming - 50 or ymax > ymaxg + 50 or xmax / 1000 > width_r): needs_correction.append(i) break + if ymin < yming - 5: + ymin = yming - 5 + b[0] = ymin * 1000 // total_height + if ymax > ymaxg + 5: + ymax = ymaxg + 5 + b[2] = ymax * 1000 // total_height + if needs_correction: tprint(f"\tBox anomalies detected for Copie {pid} {group_name}. \n\tRequesting isolated correction from Gemini Flash...") try: res["feedback"] = correct_boxes_with_gemini( - pid, label, res["feedback"], INPUT_DIR, + pid, label, res["feedback"], yming, ymaxg, width_r, total_height) except Exception as e: tprint(f"\tCorrection failed for Copie {pid}, {group_name} : {e}\n\tRemoving the boxes") @@ -726,8 +736,8 @@ def process_single_task(task_tuple, precomputed_response=None): if __name__ == "__main__": if args.refaire: - refaire_path = Path(INPUT_DIR) / "refaire.json" - overwritten_path = Path(INPUT_DIR) / "overwritten_correction.json" + refaire_path = INPUT_DIR / "refaire.json" + overwritten_path = INPUT_DIR / "overwritten_correction.json" if refaire_path.exists(): with open(refaire_path, "r", encoding="utf-8") as f: @@ -742,7 +752,7 @@ if __name__ == "__main__": for copie_name, labels in refaire_list: pid = copie_name.replace("Copie", "") - copie_dir = Path(INPUT_DIR) / copie_name + copie_dir = COPIES_DIR / copie_name # If list is empty, redo all labels available for this Copie if not labels: @@ -772,10 +782,10 @@ if __name__ == "__main__": # 2. Make new group and add to tasks pdf_path = copie_dir / f"{label}.pdf" if pdf_path.exists(): - idx = get_next_group_idx(INPUT_DIR, label) + idx = get_next_group_idx(label) height = grouping.get_pdf_height(str(pdf_path)) - grouping.create_jpg(label, idx, [(pid, str(pdf_path), height)], INPUT_DIR) - new_group_path = str(Path(INPUT_DIR) / label / f"Group_{idx+1}.jpg") + grouping.create_jpg(label, idx, [(pid, str(pdf_path), height)], GROUPS_DIR) + new_group_path = str(GROUPS_DIR / label / f"Group_{idx+1}.jpg") tasks_to_process.append((new_group_path, label)) if dirty_results: @@ -813,8 +823,8 @@ if __name__ == "__main__": tasks_to_process = [] # Run nothing live if just `--batch` if batch_tasks: - batch_flash_file = Path(INPUT_DIR) / "batch_requests_flash.jsonl" - batch_pro_file = Path(INPUT_DIR) / "batch_requests_pro.jsonl" + batch_flash_file = INPUT_DIR / "batch_requests_flash.jsonl" + batch_pro_file = INPUT_DIR / "batch_requests_pro.jsonl" count_flash = 0 count_pro = 0 @@ -873,7 +883,7 @@ if __name__ == "__main__": batched_responses = {} if args.deal_with_batched: - batch_results_path = Path(INPUT_DIR) / "batched_correction_result.jsonl" + batch_results_path = INPUT_DIR / "batched_correction_result.jsonl" if batch_results_path.exists(): print(f"Loading batch results from {batch_results_path}...") with open(batch_results_path, "r", encoding="utf-8") as f: diff --git a/cutleft.py b/cutleft.py index a098dcf..e1a985a 100644 --- a/cutleft.py +++ b/cutleft.py @@ -20,17 +20,35 @@ if len(sys.argv) < 2: path_arg = sys.argv[1] files = [] INPUT_DIR = "" +COPIES_DIR = "" if os.path.isfile(path_arg) and path_arg.lower().endswith('.pdf'): - INPUT_DIR = os.path.dirname(path_arg) + COPIES_DIR = os.path.abspath(os.path.dirname(path_arg)) + # If the file is inside a "Copies" folder, set INPUT_DIR to the parent + if os.path.basename(COPIES_DIR).lower() == 'copies': + INPUT_DIR = os.path.dirname(COPIES_DIR) + else: + INPUT_DIR = COPIES_DIR files = [os.path.basename(path_arg)] elif os.path.isdir(path_arg): - INPUT_DIR = path_arg - files = sorted([f for f in os.listdir(INPUT_DIR) if f.lower().endswith('.pdf') and - "nonc" not in f.lower()]) + # Support passing either the base dir or the Copies dir directly + abs_path = os.path.abspath(path_arg) + if os.path.basename(abs_path).lower() == 'copies': + COPIES_DIR = abs_path + INPUT_DIR = os.path.dirname(abs_path) + else: + INPUT_DIR = abs_path + COPIES_DIR = os.path.join(INPUT_DIR, 'Copies') + + if os.path.exists(COPIES_DIR): + files = sorted([f for f in os.listdir(COPIES_DIR) if f.lower().endswith('.pdf') and + "nonc" not in f.lower()]) + else: + sys.exit(f"Error: Could not find 'Copies' directory inside {INPUT_DIR}") else: sys.exit("Error: Input must be a directory or a PDF file.") + OUTPUT_DIR = os.path.join(INPUT_DIR, 'Cutleft') if not os.path.exists(OUTPUT_DIR): @@ -90,7 +108,7 @@ pdf_cache_lock = threading.Lock() @lru_cache(maxsize=3) def _get_pdf_pages_cached(filename): - pdf_path = os.path.join(INPUT_DIR, filename) + pdf_path = os.path.join(COPIES_DIR, filename) return convert_from_path(pdf_path) def get_pdf_pages(filename): diff --git a/gemini_for_labels.py b/gemini_for_labels.py index cf996eb..7e1ece8 100644 --- a/gemini_for_labels.py +++ b/gemini_for_labels.py @@ -250,7 +250,7 @@ def process_copy_group(group_key, files): for image_file in files: start_time = time.time() base_name = image_file.stem - output_json = INPUT_DIR / f"{base_name}.json" + output_json = INPUT_DIR / "Copies" / f"{base_name}.json" # Check existing if output_json.exists() and not args.overwrite: diff --git a/grouping.py b/grouping.py index 0d79751..0008f04 100644 --- a/grouping.py +++ b/grouping.py @@ -3,6 +3,7 @@ import json import re import sys import shutil +from pathlib import Path from collections import defaultdict from concurrent.futures import ThreadPoolExecutor from PIL import Image, ImageDraw, ImageFont @@ -213,9 +214,9 @@ def create_jpg(identifier, group_index, group, root_dir): from utils import natural_key -def process_identifier(identifier, files_info, root_dir): +def process_identifier(identifier, files_info, output_dir): # Clear output directory if it exists - target_folder = os.path.join(root_dir, identifier) + target_folder = os.path.join(output_dir, identifier) if os.path.exists(target_folder): shutil.rmtree(target_folder) os.makedirs(target_folder, exist_ok=True) @@ -224,27 +225,31 @@ def process_identifier(identifier, files_info, root_dir): file_groups = group_files(files_info) for idx, group in enumerate(file_groups): - create_jpg(identifier, idx, group, root_dir) + create_jpg(identifier, idx, group, output_dir) def main(): if len(sys.argv) < 2: print("Usage: python app.py ") sys.exit(1) - root_dir = sys.argv[1] + root_dir = Path(sys.argv[1]) + + copies_dir = root_dir / "Copies" + par_label_dir = root_dir / "Par label" print("Scanning files...") - data = collect_files(root_dir) + data = collect_files(copies_dir) print(f"Found {len(data)} identifiers. Processing...") # Sort identifiers naturally sorted_identifiers = sorted(data.keys(), key=natural_key) - # Process using 4 threads - with ThreadPoolExecutor(max_workers=4) as executor: + # Process using 8 threads + with ThreadPoolExecutor(max_workers=8) as executor: for identifier in sorted_identifiers: - executor.submit(process_identifier, identifier, data[identifier], root_dir) + executor.submit(process_identifier, identifier, data[identifier], + par_label_dir) print("Done.") diff --git a/page_splitter.py b/page_splitter.py index 83ebf03..f0bbd39 100644 --- a/page_splitter.py +++ b/page_splitter.py @@ -63,6 +63,9 @@ class PDFPreviewer: # Check for existing original in backup and restore if found dir_name = os.path.dirname(os.path.abspath(path)) file_name = os.path.basename(path) + if os.path.basename(dir_name) == "Copies": + dir_name = os.path.dirname(dir_name) + path = os.path.join(dir_name, file_name) backup_path = os.path.join(dir_name, "Copies Originales", file_name) if os.path.exists(backup_path): @@ -313,9 +316,12 @@ class PDFPreviewer: file_name = os.path.basename(abs_path) backup_dir = os.path.join(dir_name, "Copies Originales") + copies_dir = os.path.join(dir_name, "Copies") os.makedirs(backup_dir, exist_ok=True) + os.makedirs(copies_dir, exist_ok=True) backup_path = os.path.join(backup_dir, file_name) + copies_path = os.path.join(copies_dir, file_name) # Remove backup if it already exists (overwrite) if os.path.exists(backup_path): @@ -325,7 +331,7 @@ class PDFPreviewer: shutil.move(self.pdf_path, backup_path) # Move the temp output file to replace the original - shutil.move(self.final_file, self.pdf_path) + shutil.move(self.final_file, copies_path) # print(f"Original moved to {backup_path}, new file saved at {self.pdf_path}") diff --git a/plotting.py b/plotting.py index 961c9fb..2c8e744 100644 --- a/plotting.py +++ b/plotting.py @@ -101,7 +101,7 @@ def worker_thread(base_dir, files_to_process, all_labels): previous_copie = None last_label_index = None for img_path in files_to_process: - json_path = base_dir / f"{img_path.stem}.json" + json_path = base_dir / "Copies" / f"{img_path.stem}.json" copie_part = int(img_path.stem[-2:]) copie = img_path.stem[:-3] if copie != previous_copie: @@ -222,7 +222,7 @@ class ImageViewer: def save_current_batch(self): """Writes the accumulated data to the main JSON file.""" if self.active_copie_name and self.accumulated_results: - main_json_path = self.base_dir / f"{self.active_copie_name}.json" + main_json_path = self.base_dir / "Copies" / f"{self.active_copie_name}.json" print(f"Writing aggregated result to {main_json_path}") with open(main_json_path, 'w') as f: json.dump(self.accumulated_results, f) @@ -327,7 +327,7 @@ class ImageViewer: def on_open_ori_pdf(self, event): if self.is_viewing and self.current_json_path: new_filename = self.current_json_path.stem.split('_')[0] + ".pdf" - pdf_path = self.current_json_path.parent / "Copies Originales" / new_filename + pdf_path = self.base_dir / "Copies Originales" / new_filename print(f"Opening {pdf_path}") subprocess.Popen(['xdg-open', str(pdf_path.absolute())]) @@ -363,20 +363,21 @@ if __name__ == "__main__": files_to_process = [] if input_path.is_file(): + # Correctly identify base_dir if we are in 'Copies' or 'Cutleft' + if input_path.parent.name in ["Copies", "Cutleft"]: + base_dir = input_path.parent.parent + else: + base_dir = input_path.parent - base_dir = input_path.parent stem = input_path.stem - img_path = base_dir / "Cutleft" / f"{stem}.jpg" - files_to_process = [img_path] - if not img_path.exists() and input_path.parent.name == "Cutleft": - base_dir = input_path.parent.parent - img_path = input_path - files_to_process = [img_path] - if not img_path.exists(): - # We're given Copie01.pdf, look for parts - cutleft_dir = base_dir / "Cutleft" - files_to_process = sorted(list(cutleft_dir.glob(f"{img_path.stem}_*.jpg")), - key=natural_key) + cutleft_dir = base_dir / "Cutleft" + img_path = cutleft_dir / f"{stem}.jpg" + + if img_path.exists(): + files_to_process = [img_path] + else: + # We're given something like Copie01.pdf, look for its split image parts + files_to_process = sorted(list(cutleft_dir.glob(f"{stem}_*.jpg")), key=natural_key) else: base_dir = input_path cutleft_dir = base_dir / "Cutleft" diff --git a/reading_annotations.py b/reading_annotations.py index 14b811d..d507649 100644 --- a/reading_annotations.py +++ b/reading_annotations.py @@ -3,6 +3,7 @@ import os import json import numpy as np import shutil +from pathlib import Path from PIL import Image, ImageChops, ImageFilter Image.MAX_IMAGE_PIXELS = None from pdf2image import convert_from_path @@ -99,7 +100,7 @@ def detect_checks_and_notes(output_dir): density = changed_pixels / roi.size if density > DENSITY_THRESHOLD: - print("A checked box !", density, b) + # print("A checked box !", density, b) actions.append(box) # It's checked, so we mask this area out for manual notes # Expand mask slightly to catch sloppy ticks @@ -254,7 +255,7 @@ def apply_actions_and_regenerate(root_dir, data, student_id, actions, notes_laye # B. Regenerate Label Image # We always regenerate to ensure Concat.jpg is consistent with any modifications - pdf_path = os.path.join(root_dir, f"Copie{student_id}", f"{label}.pdf") + pdf_path = Path(root_dir) / "Copies" / f"Copie{student_id}" / f"{label}.pdf" if not os.path.exists(pdf_path): continue (base_img, _, _) = annotating.make_base_image(pdf_path) @@ -328,7 +329,6 @@ def apply_actions_and_regenerate(root_dir, data, student_id, actions, notes_laye full_img.save(os.path.join(output_dir, "Concat_F.jpg")) print(f" Saved regenerated Concat_F.jpg") -from pathlib import Path from utils import read_all_labels if __name__ == "__main__": if len(sys.argv) < 2: diff --git a/reading_grouped_annotations.py b/reading_grouped_annotations.py index e89b520..25f10f9 100644 --- a/reading_grouped_annotations.py +++ b/reading_grouped_annotations.py @@ -85,7 +85,8 @@ def save_paginated_pdf(image_groups, output_path): if pages: pages[0].save(output_path, "PDF", resolution=100.0, save_all=True, append_images=pages[1:]) -def apply_actions_and_regenerate_grouped(root_dir, data, student_id, actions, label_notes, all_labels): +def apply_actions_and_regenerate_grouped(root_dir, data, student_id, + actions, label_notes, all_labels): """ Modifies data based on actions, pastes label-specific note crops, regenerates label images for consistency, saves dirty ones, @@ -161,7 +162,7 @@ def apply_actions_and_regenerate_grouped(root_dir, data, student_id, actions, la result = content['result'] d_notes[label] = str(result.get('score', 0)) - pdf_path = os.path.join(root_dir, f"Copie{student_id}", f"{label}.pdf") + pdf_path = Path(root_dir) / "Copies" / f"Copie{student_id}" / f"{label}.pdf" if not os.path.exists(pdf_path): continue (base_img, _, _) = annotating.make_base_image(pdf_path) @@ -204,13 +205,15 @@ def apply_actions_and_regenerate_grouped(root_dir, data, student_id, actions, la concat_list.append(final_img) perfect_no_comment = True - if float(d_notes[label]) != 4.0: + if float(d_notes[label]) < 4.0: perfect_no_comment = False else: - if len(result.get('feedback', [])) != 0: - perfect_no_comment = False + lfb = result.get('feedback', []) + for e in lfb: + if "to_delete" not in e or not e["to_delete"]: + perfect_no_comment = False - if not perfect_no_comment: + if not perfect_no_comment or has_notes: extras = get_extra_pdfs_as_images(root_dir, label, annotating) extras.append(final_img) concat_list_F.append(extras) @@ -333,7 +336,8 @@ if __name__ == "__main__": if hmax > hmin: crop = notes_img.crop((0, hmin, notes_img.width, hmax)) if has_significant_notes(crop): - notes_by_student[sid][lbl] = {'img': crop, 'old_header_h': img_info.get("header_height", 0)} + notes_by_student[sid][lbl] = {'img': crop, + 'old_header_h': img_info.get("header_height", 0)} def process_refaire_entry(sid, r_labels): @@ -364,7 +368,9 @@ if __name__ == "__main__": if hmax > hmin: crop = b_notes_img.crop((0, hmin, b_notes_img.width, hmax)) if has_significant_notes(crop): - notes_by_student[sid][lbl] = {'img': crop, 'old_header_h': img_info.get("header_height", 0)} + notes_by_student[sid][lbl] = \ + {'img': crop, + 'old_header_h': img_info.get("header_height", 0)} diff --git a/splitting_int.py b/splitting_int.py index a326f5a..bd25442 100644 --- a/splitting_int.py +++ b/splitting_int.py @@ -37,7 +37,7 @@ def decode_json(pdf_file): def split_an_interro(base_dir, input_pdf, coords_list): doc = fitz.open(input_pdf) - output_dir = base_dir / input_pdf.stem + output_dir = base_dir / "Copies" / input_pdf.stem generated_files = set() parts_by_label = defaultdict(list) @@ -197,10 +197,13 @@ if __name__ == "__main__": if input_arg.is_file(): base_dir = input_arg.parent + if base_dir.name == "Copies": + base_dir = base_dir.parent pdf_files = [input_arg] elif input_arg.is_dir(): base_dir = input_arg - pdf_files = sorted(base_dir.glob("*.pdf")) + copies_dir = base_dir / "Copies" + pdf_files = sorted(copies_dir.glob("*.pdf")) else: print(f"Error: {input_arg} is not a valid file or directory.") sys.exit(1) diff --git a/utils.py b/utils.py index d93b282..5919e8a 100644 --- a/utils.py +++ b/utils.py @@ -5,9 +5,7 @@ def natural_key(text): return [int(c) if c.isdigit() else c.lower() for c in re.split(r'(\d+)', str(text))] def read_all_labels(base_dir): - return sorted(list(filter(None, - (Path(base_dir) / "labels").read_text().splitlines())), - key = natural_key) + return list(filter(None, (Path(base_dir) / "labels").read_text().splitlines())) def enonce_total(base_dir): text_dir = Path(base_dir) / 'Text'