Using folders "Copies" and "Par label", hopefully

master
Sébastien Miquel 2026-05-17 11:25:36 +02:00
parent 7e7045293a
commit c6c1a052e1
12 changed files with 142 additions and 95 deletions

View File

@ -1,7 +1,7 @@
#+title: Script #+title: Script
#+author: Sébastien Miquel #+author: Sébastien Miquel
#+date: 14-03-2026 #+date: 14-03-2026
# Time-stamp: <14-05-26 08:55> # Time-stamp: <17-05-26 10:51>
#+OPTIONS: #+OPTIONS:
* Méta * Méta
@ -101,13 +101,13 @@ export GEMINI_API_KEY=…
Les key bindings ne sont pas adaptés à un clavier azerty… À changer… Les key bindings ne sont pas adaptés à un clavier azerty… À changer…
Fix issues with =python page_splitter.py Interro14/Copie01.pdf= Fix issues with =python page_splitter.py Interro14/Copies/Copie01.pdf=
4. =python cutleft.py Interro= 4. =python cutleft.py Interro=
Découpe la partie gauche des copies, là où il devrait y avoir les Découpe la partie gauche des copies, là où il devrait y avoir les
labels des exercices/questions. labels des exercices/questions.
Rerun on a single file with =python cutleft.py Interro/Copie01.pdf= Rerun on a single file with =python cutleft.py Interro/Copies/Copie01.pdf=
** Génération d'information sur l'énoncé ** Génération d'information sur l'énoncé
@ -136,7 +136,7 @@ Set proxy with ~export HTTPS_PROXY="http://10.0.0.1:3128"~
+ `|…` n'est pas arrêté verticalement par son type opposé. + `|…` n'est pas arrêté verticalement par son type opposé.
+ `…|` est stoppé horizontalement par le `|…` le plus proche. + `…|` est stoppé horizontalement par le `|…` le plus proche.
Pour modifier une seule copie : Pour modifier une seule copie :
=python plotting.py Interro/Copie01.pdf= =python plotting.py Interro/Copies/Copie01.pdf=
It also generates les =Copie01.json=, à partir des =Copie01_01.json= It also generates les =Copie01.json=, à partir des =Copie01_01.json=
1. En cas de soucis, (par exemple les pages ne sont pas dans le bon ordre) 1. En cas de soucis, (par exemple les pages ne sont pas dans le bon ordre)
@ -147,6 +147,7 @@ Set proxy with ~export HTTPS_PROXY="http://10.0.0.1:3128"~
3. =python splitting_int.py Interro= 3. =python splitting_int.py Interro=
Découpe les copies suivant les exercices Découpe les copies suivant les exercices
Peut être appelé avec une seule copie.
4. =python grouping.py Interro= 4. =python grouping.py Interro=
Regroupe les mêmes questions de différentes copies en groupes de Regroupe les mêmes questions de différentes copies en groupes de
@ -159,7 +160,7 @@ Set proxy with ~export HTTPS_PROXY="http://10.0.0.1:3128"~
1. Il faut créer des persp, pour indication de comment corriger, et 1. Il faut créer des persp, pour indication de comment corriger, et
relancer =enonce_info.py= relancer =enonce_info.py=
2. =python correction.py Interro --limit 240= OU 2. =python correction.py Interro --limit 240= OU
=python correction.py Interro/Ex\ 2/Group_1.jpg= OU =python correction.py Interro/Par\ label/Ex\ 2/Group_1.jpg= OU
=python correction.py Interro --overwrite= =python correction.py Interro --overwrite=
=python correction.py Interro --pro-by-label= (needs `labels_for_pro`) =python correction.py Interro --pro-by-label= (needs `labels_for_pro`)

View File

@ -37,7 +37,7 @@ def make_dictionary(root_dir, refaire=False, refaire_list=[]):
# Find coordinates # Find coordinates
coordinates = None coordinates = None
height,width= None, None height,width= None, None
label_dir = os.path.join(root_dir, label) label_dir = Path(root_dir) / "Par label" / label
# Search all json files in Dir/label # Search all json files in Dir/label
json_files = glob.glob(os.path.join(label_dir, "*.json")) json_files = glob.glob(os.path.join(label_dir, "*.json"))
@ -59,7 +59,7 @@ def make_dictionary(root_dir, refaire=False, refaire_list=[]):
break break
# Construct PDF path: Dir/Copie{id}/{label}.pdf # Construct PDF path: Dir/Copie{id}/{label}.pdf
pdf_path = os.path.join(root_dir, f"Copie{student_id}", f"{label}.pdf") pdf_path = Path(root_dir) / "Copies" / f"Copie{student_id}" / f"{label}.pdf"
# Initialize dictionary structure for this ID if missing # Initialize dictionary structure for this ID if missing
if student_id not in result_data: if student_id not in result_data:
@ -89,8 +89,7 @@ def make_dictionary(root_dir, refaire=False, refaire_list=[]):
# On ajoute des dummies # On ajoute des dummies
if labels_to_redo: # Si la liste est non vide if labels_to_redo: # Si la liste est non vide
for lbl in labels_to_redo: for lbl in labels_to_redo:
pdf_path = os.path.join(root_dir, pdf_path = Path(root_dir) / "Copies" / f"Copie{sid}" / f"{lbl}.pdf"
f"Copie{sid}", f"{lbl}.pdf")
if not Path(pdf_path).exists(): if not Path(pdf_path).exists():
print("Debug : asked to refaire", sid, lbl, "but pdf absent") print("Debug : asked to refaire", sid, lbl, "but pdf absent")
continue continue
@ -107,8 +106,7 @@ def make_dictionary(root_dir, refaire=False, refaire_list=[]):
else: # Ce student id n'a jamais été corrigé else: # Ce student id n'a jamais été corrigé
result_data[sid] = {} result_data[sid] = {}
for lbl in labels_to_redo: for lbl in labels_to_redo:
pdf_path = os.path.join(root_dir, pdf_path = Path(root_dir) / "Copies" / f"Copie{sid}" / f"{lbl}.pdf"
f"Copie{sid}", f"{lbl}.pdf")
if not pdf_path.exists(): if not pdf_path.exists():
print("Debug : asked to refaire", sid, lbl, "but pdf absent") print("Debug : asked to refaire", sid, lbl, "but pdf absent")
continue continue
@ -567,13 +565,13 @@ def process_student(student_id, labels_data, root_dir, all_labels, overwrite):
d_notes = dict.fromkeys(all_labels, "") d_notes = dict.fromkeys(all_labels, "")
label_images = [] label_images = []
# !! Trier par l'ordre des labels plutôt
sorted_labels = sorted(list(labels_data.items()), key=natural_key) sorted_labels = sorted(list(labels_data.items()), key=natural_key)
for label, content in sorted_labels: for label, content in sorted_labels:
# 1. Find PDF path # 1. Find PDF path
copie_folder = f"Copie{student_id}" copie_folder = f"Copie{student_id}"
pdf_rel_path = os.path.join(copie_folder, f"{label}.pdf") pdf_full_path = Path(root_dir) / "Copies" / copie_folder / f"{label}.pdf"
pdf_full_path = os.path.join(root_dir, pdf_rel_path)
if not os.path.exists(pdf_full_path): if not os.path.exists(pdf_full_path):
print(f"File not found: {pdf_full_path}") print(f"File not found: {pdf_full_path}")
@ -629,13 +627,14 @@ def process_correction(root_dir, data, all_labels, overwrite=False):
# # Wait for all threads to complete # # Wait for all threads to complete
# concurrent.futures.wait(futures) # concurrent.futures.wait(futures)
# Ne pas thread cette applications # Ne pas thread cette application
# 1. Il faut protéger les appels à matplotlib # 1. Il faut protéger les appels à matplotlib
# 2. tu vas perdre les erreurs # 2. tu vas perdre les erreurs
for student_id, labels in sorted(data.items()): for student_id, labels in sorted(data.items()):
process_student(student_id, labels, root_dir, all_labels, overwrite) process_student(student_id, labels, root_dir, all_labels, overwrite)
import argparse import argparse
import utils
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Annotate copies") parser = argparse.ArgumentParser(description="Annotate copies")
@ -644,7 +643,7 @@ if __name__ == "__main__":
args = parser.parse_args() args = parser.parse_args()
root_dir = args.root_dir root_dir = args.root_dir
labels = list(filter(None, (Path(root_dir) / "labels").read_text().splitlines())) labels = utils.read_all_labels(root_dir)
results = make_dictionary(root_dir) results = make_dictionary(root_dir)
# Results is : Copie id -> label -> {pdf_path, gemini_result, coordinates} # Results is : Copie id -> label -> {pdf_path, gemini_result, coordinates}
# Coordinates are the real coordinates (hmin, hmax) of the image in the Group # Coordinates are the real coordinates (hmin, hmax) of the image in the Group

View File

@ -38,14 +38,20 @@ for path_str in args.paths:
# Handle individual file # Handle individual file
# Note: assumes structure InterroTest/Ex 2/Group_1.jpg to get parents[1] # Note: assumes structure InterroTest/Ex 2/Group_1.jpg to get parents[1]
label = arg_path.parent.name label = arg_path.parent.name
INPUT_DIR = arg_path.parent.parent.parent
COPIES_DIR = INPUT_DIR / "Copies"
GROUPS_DIR = INPUT_DIR / "Par label"
tasks.append((str(arg_path), label)) tasks.append((str(arg_path), label))
if label not in results: if label not in results:
results[label] = [] results[label] = []
elif arg_path.is_dir(): elif arg_path.is_dir():
INPUT_DIR = arg_path
COPIES_DIR = INPUT_DIR / "Copies"
GROUPS_DIR = INPUT_DIR / "Par label"
# Handle directory (original behavior) # Handle directory (original behavior)
for sub in arg_path.iterdir(): for sub in GROUPS_DIR.iterdir():
if sub.is_dir() and sub.name.startswith("Ex"): if sub.is_dir():
label = sub.name label = sub.name
if label not in results: if label not in results:
results[label] = [] results[label] = []
@ -145,7 +151,7 @@ do not score or give feedback to any other question."""
def make_prompt(full_label): def make_prompt(full_label):
def read_longest_prefix_file(subdir): def read_longest_prefix_file(subdir):
dir_path = Path(INPUT_DIR) / subdir dir_path = INPUT_DIR / subdir
matches = [f for f in dir_path.iterdir() matches = [f for f in dir_path.iterdir()
if f.is_file() if f.is_file()
and full_label.startswith(f.name) and full_label.startswith(f.name)
@ -167,7 +173,6 @@ from google.genai import types
import base64 import base64
import shlex import shlex
import json import json
from pathlib import Path
import os import os
import threading import threading
import concurrent.futures import concurrent.futures
@ -210,7 +215,7 @@ def flush_thread_log(tid=None):
tid = tid or threading.current_thread().name tid = tid or threading.current_thread().name
with log_lock: with log_lock:
if thread_logs.get(tid): if thread_logs.get(tid):
with open(Path(INPUT_DIR) / "correction_log", "a", encoding="utf-8") as f: with open(INPUT_DIR / "correction_log", "a", encoding="utf-8") as f:
f.write(f"--- Task Log [{tid}] ---\n") f.write(f"--- Task Log [{tid}] ---\n")
f.write("\n".join(thread_logs[tid]) + "\n\n") f.write("\n".join(thread_logs[tid]) + "\n\n")
thread_logs[tid].clear() thread_logs[tid].clear()
@ -311,8 +316,8 @@ def generate_request(file, full_label):
return (contents, generate_content_config) return (contents, generate_content_config)
client = genai.Client(api_key=api_key) client = genai.Client(api_key=api_key)
output_path = Path(INPUT_DIR) / "correction.json" output_path = INPUT_DIR / "correction.json"
progress_path = Path(INPUT_DIR) / "correction_progress.json" progress_path = INPUT_DIR / "correction_progress.json"
start_time = time.time() start_time = time.time()
overwrite = args.overwrite overwrite = args.overwrite
limit = args.limit limit = args.limit
@ -407,9 +412,9 @@ def get_single_image_bytes(pdf_path):
return img_byte_arr.getvalue() return img_byte_arr.getvalue()
def correct_boxes_with_gemini(pid, label, original_feedbacks, def correct_boxes_with_gemini(pid, label, original_feedbacks,
root_dir, yming, ymaxg, width_r, total_height): yming, ymaxg, width_r, total_height):
"""Requests corrected bounding boxes from Gemini Flash on the single image.""" """Requests corrected bounding boxes from Gemini Flash on the single image."""
pdf_path = Path(root_dir) / f"Copie{pid}" / f"{label}.pdf" pdf_path = COPIES_DIR / f"Copie{pid}" / f"{label}.pdf"
img_bytes = get_single_image_bytes(pdf_path) img_bytes = get_single_image_bytes(pdf_path)
localized_feedbacks = [f for f in original_feedbacks if f["box_2d"]] localized_feedbacks = [f for f in original_feedbacks if f["box_2d"]]
@ -473,9 +478,9 @@ it goes wrong, and the feedback is what went wrong.
import shutil import shutil
import grouping import grouping
def get_next_group_idx(root_dir, label): def get_next_group_idx(label):
"""Finds the next available Group index for a given label.""" """Finds the next available Group index for a given label."""
target_folder = Path(root_dir) / label target_folder = GROUPS_DIR / label
target_folder.mkdir(exist_ok=True) target_folder.mkdir(exist_ok=True)
existing = list(target_folder.glob("Group_*.jpg")) existing = list(target_folder.glob("Group_*.jpg"))
if not existing: return 0 if not existing: return 0
@ -489,7 +494,7 @@ def handle_label_errors(pid, label, res, pdf_path):
error_type = res.get("error") error_type = res.get("error")
all_labels = read_all_labels(INPUT_DIR) all_labels = read_all_labels(INPUT_DIR)
labels_txt = (Path(INPUT_DIR) / "labels").read_text(encoding="utf-8", errors="replace") labels_txt = (INPUT_DIR / "labels").read_text(encoding="utf-8", errors="replace")
enonce = enonce_total(INPUT_DIR) enonce = enonce_total(INPUT_DIR)
if error_type == "wrong-label": if error_type == "wrong-label":
@ -523,7 +528,7 @@ Here is a list of all possible labels. You need to answer with one of these :
if new_label == label: if new_label == label:
res["error"] = "" res["error"] = ""
return [] return []
new_pdf_path = Path(INPUT_DIR) / f"Copie{pid}" / f"{new_label}.pdf" new_pdf_path = COPIES_DIR / f"Copie{pid}" / f"{new_label}.pdf"
if new_pdf_path.exists(): if new_pdf_path.exists():
tprint(f"\t\tCopie{pid} tried to move wrong {label} to {new_label}, but it already exists.") tprint(f"\t\tCopie{pid} tried to move wrong {label} to {new_label}, but it already exists.")
res["error"] = f"wrg-lbl:{new_label}?exists" res["error"] = f"wrg-lbl:{new_label}?exists"
@ -533,12 +538,12 @@ Here is a list of all possible labels. You need to answer with one of these :
shutil.move(str(pdf_path), str(new_pdf_path)) shutil.move(str(pdf_path), str(new_pdf_path))
# Since we moved the file, this Copie/label should not be taken # Since we moved the file, this Copie/label should not be taken
# into account in the future, I think # into account in the future, I think
idx = get_next_group_idx(INPUT_DIR, new_label) idx = get_next_group_idx(new_label)
height = grouping.get_pdf_height(str(new_pdf_path)) height = grouping.get_pdf_height(str(new_pdf_path))
grouping.create_jpg(new_label, idx, [(pid, str(new_pdf_path), height)], grouping.create_jpg(new_label, idx, [(pid, str(new_pdf_path), height)],
INPUT_DIR) GROUPS_DIR)
tprint(f"\t\tMaking {new_label} group {idx+1}") tprint(f"\t\tMaking {new_label} group {idx+1}")
new_tasks.append((str(Path(INPUT_DIR) / new_label / f"Group_{idx+1}.jpg"), new_tasks.append((str(GROUPS_DIR / new_label / f"Group_{idx+1}.jpg"),
new_label, False)) new_label, False))
elif error_type == "additional-answer": elif error_type == "additional-answer":
@ -580,15 +585,15 @@ Here is a list of all possible labels. You need to answer with a list one of the
error += f"{add_label}??" error += f"{add_label}??"
keep_error = True keep_error = True
continue continue
new_pdf_path = Path(INPUT_DIR) / f"Copie{pid}" / f"{add_label}.pdf" new_pdf_path = COPIES_DIR / f"Copie{pid}" / f"{add_label}.pdf"
if not new_pdf_path.exists(): if not new_pdf_path.exists():
shutil.copy(str(pdf_path), str(new_pdf_path)) shutil.copy(str(pdf_path), str(new_pdf_path))
tprint(f"\t\tCopying Copie{pid} : {label} -> {add_label}") tprint(f"\t\tCopying Copie{pid} : {label} -> {add_label}")
idx = get_next_group_idx(INPUT_DIR, add_label) idx = get_next_group_idx(add_label)
tprint(f"\t\tMaking {add_label} group {idx+1}") tprint(f"\t\tMaking {add_label} group {idx+1}")
height = grouping.get_pdf_height(str(new_pdf_path)) height = grouping.get_pdf_height(str(new_pdf_path))
grouping.create_jpg(add_label, idx, [(pid, str(new_pdf_path), height)], INPUT_DIR) grouping.create_jpg(add_label, idx, [(pid, str(new_pdf_path), height)], GROUPS_DIR)
new_tasks.append((str(Path(INPUT_DIR) / add_label / f"Group_{idx+1}.jpg"), new_tasks.append((str(GROUPS_DIR / add_label / f"Group_{idx+1}.jpg"),
add_label, False)) add_label, False))
error += f"(->){add_label}" error += f"(->){add_label}"
keep_error = True keep_error = True
@ -657,7 +662,7 @@ def process_single_task(task_tuple, precomputed_response=None):
res = p["result"] res = p["result"]
yming, ymaxg, width_r = d_data[pid] yming, ymaxg, width_r = d_data[pid]
pdf_path = Path(INPUT_DIR) / f"Copie{pid}" / f"{label}.pdf" pdf_path = COPIES_DIR / f"Copie{pid}" / f"{label}.pdf"
if (not can_spawn_tasks) and res["error"] == "additional-answer": if (not can_spawn_tasks) and res["error"] == "additional-answer":
tprint("\tSwallowing an additional-answer from a subsequent task.") tprint("\tSwallowing an additional-answer from a subsequent task.")
res["error"]= "" res["error"]= ""
@ -680,17 +685,22 @@ def process_single_task(task_tuple, precomputed_response=None):
pid, label, group_name) pid, label, group_name)
continue continue
if (ymin < yming - 50 or if (ymin < yming - 50 or ymax > ymaxg + 50 or xmax / 1000 > width_r):
ymax > ymaxg + 50 or
xmax / 1000 > width_r):
needs_correction.append(i) needs_correction.append(i)
break break
if ymin < yming - 5:
ymin = yming - 5
b[0] = ymin * 1000 // total_height
if ymax > ymaxg + 5:
ymax = ymaxg + 5
b[2] = ymax * 1000 // total_height
if needs_correction: if needs_correction:
tprint(f"\tBox anomalies detected for Copie {pid} {group_name}. \n\tRequesting isolated correction from Gemini Flash...") tprint(f"\tBox anomalies detected for Copie {pid} {group_name}. \n\tRequesting isolated correction from Gemini Flash...")
try: try:
res["feedback"] = correct_boxes_with_gemini( res["feedback"] = correct_boxes_with_gemini(
pid, label, res["feedback"], INPUT_DIR, pid, label, res["feedback"],
yming, ymaxg, width_r, total_height) yming, ymaxg, width_r, total_height)
except Exception as e: except Exception as e:
tprint(f"\tCorrection failed for Copie {pid}, {group_name} : {e}\n\tRemoving the boxes") tprint(f"\tCorrection failed for Copie {pid}, {group_name} : {e}\n\tRemoving the boxes")
@ -726,8 +736,8 @@ def process_single_task(task_tuple, precomputed_response=None):
if __name__ == "__main__": if __name__ == "__main__":
if args.refaire: if args.refaire:
refaire_path = Path(INPUT_DIR) / "refaire.json" refaire_path = INPUT_DIR / "refaire.json"
overwritten_path = Path(INPUT_DIR) / "overwritten_correction.json" overwritten_path = INPUT_DIR / "overwritten_correction.json"
if refaire_path.exists(): if refaire_path.exists():
with open(refaire_path, "r", encoding="utf-8") as f: with open(refaire_path, "r", encoding="utf-8") as f:
@ -742,7 +752,7 @@ if __name__ == "__main__":
for copie_name, labels in refaire_list: for copie_name, labels in refaire_list:
pid = copie_name.replace("Copie", "") pid = copie_name.replace("Copie", "")
copie_dir = Path(INPUT_DIR) / copie_name copie_dir = COPIES_DIR / copie_name
# If list is empty, redo all labels available for this Copie # If list is empty, redo all labels available for this Copie
if not labels: if not labels:
@ -772,10 +782,10 @@ if __name__ == "__main__":
# 2. Make new group and add to tasks # 2. Make new group and add to tasks
pdf_path = copie_dir / f"{label}.pdf" pdf_path = copie_dir / f"{label}.pdf"
if pdf_path.exists(): if pdf_path.exists():
idx = get_next_group_idx(INPUT_DIR, label) idx = get_next_group_idx(label)
height = grouping.get_pdf_height(str(pdf_path)) height = grouping.get_pdf_height(str(pdf_path))
grouping.create_jpg(label, idx, [(pid, str(pdf_path), height)], INPUT_DIR) grouping.create_jpg(label, idx, [(pid, str(pdf_path), height)], GROUPS_DIR)
new_group_path = str(Path(INPUT_DIR) / label / f"Group_{idx+1}.jpg") new_group_path = str(GROUPS_DIR / label / f"Group_{idx+1}.jpg")
tasks_to_process.append((new_group_path, label)) tasks_to_process.append((new_group_path, label))
if dirty_results: if dirty_results:
@ -813,8 +823,8 @@ if __name__ == "__main__":
tasks_to_process = [] # Run nothing live if just `--batch` tasks_to_process = [] # Run nothing live if just `--batch`
if batch_tasks: if batch_tasks:
batch_flash_file = Path(INPUT_DIR) / "batch_requests_flash.jsonl" batch_flash_file = INPUT_DIR / "batch_requests_flash.jsonl"
batch_pro_file = Path(INPUT_DIR) / "batch_requests_pro.jsonl" batch_pro_file = INPUT_DIR / "batch_requests_pro.jsonl"
count_flash = 0 count_flash = 0
count_pro = 0 count_pro = 0
@ -873,7 +883,7 @@ if __name__ == "__main__":
batched_responses = {} batched_responses = {}
if args.deal_with_batched: if args.deal_with_batched:
batch_results_path = Path(INPUT_DIR) / "batched_correction_result.jsonl" batch_results_path = INPUT_DIR / "batched_correction_result.jsonl"
if batch_results_path.exists(): if batch_results_path.exists():
print(f"Loading batch results from {batch_results_path}...") print(f"Loading batch results from {batch_results_path}...")
with open(batch_results_path, "r", encoding="utf-8") as f: with open(batch_results_path, "r", encoding="utf-8") as f:

View File

@ -20,17 +20,35 @@ if len(sys.argv) < 2:
path_arg = sys.argv[1] path_arg = sys.argv[1]
files = [] files = []
INPUT_DIR = "" INPUT_DIR = ""
COPIES_DIR = ""
if os.path.isfile(path_arg) and path_arg.lower().endswith('.pdf'): if os.path.isfile(path_arg) and path_arg.lower().endswith('.pdf'):
INPUT_DIR = os.path.dirname(path_arg) COPIES_DIR = os.path.abspath(os.path.dirname(path_arg))
# If the file is inside a "Copies" folder, set INPUT_DIR to the parent
if os.path.basename(COPIES_DIR).lower() == 'copies':
INPUT_DIR = os.path.dirname(COPIES_DIR)
else:
INPUT_DIR = COPIES_DIR
files = [os.path.basename(path_arg)] files = [os.path.basename(path_arg)]
elif os.path.isdir(path_arg): elif os.path.isdir(path_arg):
INPUT_DIR = path_arg # Support passing either the base dir or the Copies dir directly
files = sorted([f for f in os.listdir(INPUT_DIR) if f.lower().endswith('.pdf') and abs_path = os.path.abspath(path_arg)
if os.path.basename(abs_path).lower() == 'copies':
COPIES_DIR = abs_path
INPUT_DIR = os.path.dirname(abs_path)
else:
INPUT_DIR = abs_path
COPIES_DIR = os.path.join(INPUT_DIR, 'Copies')
if os.path.exists(COPIES_DIR):
files = sorted([f for f in os.listdir(COPIES_DIR) if f.lower().endswith('.pdf') and
"nonc" not in f.lower()]) "nonc" not in f.lower()])
else:
sys.exit(f"Error: Could not find 'Copies' directory inside {INPUT_DIR}")
else: else:
sys.exit("Error: Input must be a directory or a PDF file.") sys.exit("Error: Input must be a directory or a PDF file.")
OUTPUT_DIR = os.path.join(INPUT_DIR, 'Cutleft') OUTPUT_DIR = os.path.join(INPUT_DIR, 'Cutleft')
if not os.path.exists(OUTPUT_DIR): if not os.path.exists(OUTPUT_DIR):
@ -90,7 +108,7 @@ pdf_cache_lock = threading.Lock()
@lru_cache(maxsize=3) @lru_cache(maxsize=3)
def _get_pdf_pages_cached(filename): def _get_pdf_pages_cached(filename):
pdf_path = os.path.join(INPUT_DIR, filename) pdf_path = os.path.join(COPIES_DIR, filename)
return convert_from_path(pdf_path) return convert_from_path(pdf_path)
def get_pdf_pages(filename): def get_pdf_pages(filename):

View File

@ -250,7 +250,7 @@ def process_copy_group(group_key, files):
for image_file in files: for image_file in files:
start_time = time.time() start_time = time.time()
base_name = image_file.stem base_name = image_file.stem
output_json = INPUT_DIR / f"{base_name}.json" output_json = INPUT_DIR / "Copies" / f"{base_name}.json"
# Check existing # Check existing
if output_json.exists() and not args.overwrite: if output_json.exists() and not args.overwrite:

View File

@ -3,6 +3,7 @@ import json
import re import re
import sys import sys
import shutil import shutil
from pathlib import Path
from collections import defaultdict from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from PIL import Image, ImageDraw, ImageFont from PIL import Image, ImageDraw, ImageFont
@ -213,9 +214,9 @@ def create_jpg(identifier, group_index, group, root_dir):
from utils import natural_key from utils import natural_key
def process_identifier(identifier, files_info, root_dir): def process_identifier(identifier, files_info, output_dir):
# Clear output directory if it exists # Clear output directory if it exists
target_folder = os.path.join(root_dir, identifier) target_folder = os.path.join(output_dir, identifier)
if os.path.exists(target_folder): if os.path.exists(target_folder):
shutil.rmtree(target_folder) shutil.rmtree(target_folder)
os.makedirs(target_folder, exist_ok=True) os.makedirs(target_folder, exist_ok=True)
@ -224,27 +225,31 @@ def process_identifier(identifier, files_info, root_dir):
file_groups = group_files(files_info) file_groups = group_files(files_info)
for idx, group in enumerate(file_groups): for idx, group in enumerate(file_groups):
create_jpg(identifier, idx, group, root_dir) create_jpg(identifier, idx, group, output_dir)
def main(): def main():
if len(sys.argv) < 2: if len(sys.argv) < 2:
print("Usage: python app.py <Path_to_Dir>") print("Usage: python app.py <Path_to_Dir>")
sys.exit(1) sys.exit(1)
root_dir = sys.argv[1] root_dir = Path(sys.argv[1])
copies_dir = root_dir / "Copies"
par_label_dir = root_dir / "Par label"
print("Scanning files...") print("Scanning files...")
data = collect_files(root_dir) data = collect_files(copies_dir)
print(f"Found {len(data)} identifiers. Processing...") print(f"Found {len(data)} identifiers. Processing...")
# Sort identifiers naturally # Sort identifiers naturally
sorted_identifiers = sorted(data.keys(), key=natural_key) sorted_identifiers = sorted(data.keys(), key=natural_key)
# Process using 4 threads # Process using 8 threads
with ThreadPoolExecutor(max_workers=4) as executor: with ThreadPoolExecutor(max_workers=8) as executor:
for identifier in sorted_identifiers: for identifier in sorted_identifiers:
executor.submit(process_identifier, identifier, data[identifier], root_dir) executor.submit(process_identifier, identifier, data[identifier],
par_label_dir)
print("Done.") print("Done.")

View File

@ -63,6 +63,9 @@ class PDFPreviewer:
# Check for existing original in backup and restore if found # Check for existing original in backup and restore if found
dir_name = os.path.dirname(os.path.abspath(path)) dir_name = os.path.dirname(os.path.abspath(path))
file_name = os.path.basename(path) file_name = os.path.basename(path)
if os.path.basename(dir_name) == "Copies":
dir_name = os.path.dirname(dir_name)
path = os.path.join(dir_name, file_name)
backup_path = os.path.join(dir_name, "Copies Originales", file_name) backup_path = os.path.join(dir_name, "Copies Originales", file_name)
if os.path.exists(backup_path): if os.path.exists(backup_path):
@ -313,9 +316,12 @@ class PDFPreviewer:
file_name = os.path.basename(abs_path) file_name = os.path.basename(abs_path)
backup_dir = os.path.join(dir_name, "Copies Originales") backup_dir = os.path.join(dir_name, "Copies Originales")
copies_dir = os.path.join(dir_name, "Copies")
os.makedirs(backup_dir, exist_ok=True) os.makedirs(backup_dir, exist_ok=True)
os.makedirs(copies_dir, exist_ok=True)
backup_path = os.path.join(backup_dir, file_name) backup_path = os.path.join(backup_dir, file_name)
copies_path = os.path.join(copies_dir, file_name)
# Remove backup if it already exists (overwrite) # Remove backup if it already exists (overwrite)
if os.path.exists(backup_path): if os.path.exists(backup_path):
@ -325,7 +331,7 @@ class PDFPreviewer:
shutil.move(self.pdf_path, backup_path) shutil.move(self.pdf_path, backup_path)
# Move the temp output file to replace the original # Move the temp output file to replace the original
shutil.move(self.final_file, self.pdf_path) shutil.move(self.final_file, copies_path)
# print(f"Original moved to {backup_path}, new file saved at {self.pdf_path}") # print(f"Original moved to {backup_path}, new file saved at {self.pdf_path}")

View File

@ -101,7 +101,7 @@ def worker_thread(base_dir, files_to_process, all_labels):
previous_copie = None previous_copie = None
last_label_index = None last_label_index = None
for img_path in files_to_process: for img_path in files_to_process:
json_path = base_dir / f"{img_path.stem}.json" json_path = base_dir / "Copies" / f"{img_path.stem}.json"
copie_part = int(img_path.stem[-2:]) copie_part = int(img_path.stem[-2:])
copie = img_path.stem[:-3] copie = img_path.stem[:-3]
if copie != previous_copie: if copie != previous_copie:
@ -222,7 +222,7 @@ class ImageViewer:
def save_current_batch(self): def save_current_batch(self):
"""Writes the accumulated data to the main JSON file.""" """Writes the accumulated data to the main JSON file."""
if self.active_copie_name and self.accumulated_results: if self.active_copie_name and self.accumulated_results:
main_json_path = self.base_dir / f"{self.active_copie_name}.json" main_json_path = self.base_dir / "Copies" / f"{self.active_copie_name}.json"
print(f"Writing aggregated result to {main_json_path}") print(f"Writing aggregated result to {main_json_path}")
with open(main_json_path, 'w') as f: with open(main_json_path, 'w') as f:
json.dump(self.accumulated_results, f) json.dump(self.accumulated_results, f)
@ -327,7 +327,7 @@ class ImageViewer:
def on_open_ori_pdf(self, event): def on_open_ori_pdf(self, event):
if self.is_viewing and self.current_json_path: if self.is_viewing and self.current_json_path:
new_filename = self.current_json_path.stem.split('_')[0] + ".pdf" new_filename = self.current_json_path.stem.split('_')[0] + ".pdf"
pdf_path = self.current_json_path.parent / "Copies Originales" / new_filename pdf_path = self.base_dir / "Copies Originales" / new_filename
print(f"Opening {pdf_path}") print(f"Opening {pdf_path}")
subprocess.Popen(['xdg-open', str(pdf_path.absolute())]) subprocess.Popen(['xdg-open', str(pdf_path.absolute())])
@ -363,20 +363,21 @@ if __name__ == "__main__":
files_to_process = [] files_to_process = []
if input_path.is_file(): if input_path.is_file():
# Correctly identify base_dir if we are in 'Copies' or 'Cutleft'
base_dir = input_path.parent if input_path.parent.name in ["Copies", "Cutleft"]:
stem = input_path.stem
img_path = base_dir / "Cutleft" / f"{stem}.jpg"
files_to_process = [img_path]
if not img_path.exists() and input_path.parent.name == "Cutleft":
base_dir = input_path.parent.parent base_dir = input_path.parent.parent
img_path = input_path else:
files_to_process = [img_path] base_dir = input_path.parent
if not img_path.exists():
# We're given Copie01.pdf, look for parts stem = input_path.stem
cutleft_dir = base_dir / "Cutleft" cutleft_dir = base_dir / "Cutleft"
files_to_process = sorted(list(cutleft_dir.glob(f"{img_path.stem}_*.jpg")), img_path = cutleft_dir / f"{stem}.jpg"
key=natural_key)
if img_path.exists():
files_to_process = [img_path]
else:
# We're given something like Copie01.pdf, look for its split image parts
files_to_process = sorted(list(cutleft_dir.glob(f"{stem}_*.jpg")), key=natural_key)
else: else:
base_dir = input_path base_dir = input_path
cutleft_dir = base_dir / "Cutleft" cutleft_dir = base_dir / "Cutleft"

View File

@ -3,6 +3,7 @@ import os
import json import json
import numpy as np import numpy as np
import shutil import shutil
from pathlib import Path
from PIL import Image, ImageChops, ImageFilter from PIL import Image, ImageChops, ImageFilter
Image.MAX_IMAGE_PIXELS = None Image.MAX_IMAGE_PIXELS = None
from pdf2image import convert_from_path from pdf2image import convert_from_path
@ -99,7 +100,7 @@ def detect_checks_and_notes(output_dir):
density = changed_pixels / roi.size density = changed_pixels / roi.size
if density > DENSITY_THRESHOLD: if density > DENSITY_THRESHOLD:
print("A checked box !", density, b) # print("A checked box !", density, b)
actions.append(box) actions.append(box)
# It's checked, so we mask this area out for manual notes # It's checked, so we mask this area out for manual notes
# Expand mask slightly to catch sloppy ticks # Expand mask slightly to catch sloppy ticks
@ -254,7 +255,7 @@ def apply_actions_and_regenerate(root_dir, data, student_id, actions, notes_laye
# B. Regenerate Label Image # B. Regenerate Label Image
# We always regenerate to ensure Concat.jpg is consistent with any modifications # We always regenerate to ensure Concat.jpg is consistent with any modifications
pdf_path = os.path.join(root_dir, f"Copie{student_id}", f"{label}.pdf") pdf_path = Path(root_dir) / "Copies" / f"Copie{student_id}" / f"{label}.pdf"
if not os.path.exists(pdf_path): continue if not os.path.exists(pdf_path): continue
(base_img, _, _) = annotating.make_base_image(pdf_path) (base_img, _, _) = annotating.make_base_image(pdf_path)
@ -328,7 +329,6 @@ def apply_actions_and_regenerate(root_dir, data, student_id, actions, notes_laye
full_img.save(os.path.join(output_dir, "Concat_F.jpg")) full_img.save(os.path.join(output_dir, "Concat_F.jpg"))
print(f" Saved regenerated Concat_F.jpg") print(f" Saved regenerated Concat_F.jpg")
from pathlib import Path
from utils import read_all_labels from utils import read_all_labels
if __name__ == "__main__": if __name__ == "__main__":
if len(sys.argv) < 2: if len(sys.argv) < 2:

View File

@ -85,7 +85,8 @@ def save_paginated_pdf(image_groups, output_path):
if pages: if pages:
pages[0].save(output_path, "PDF", resolution=100.0, save_all=True, append_images=pages[1:]) pages[0].save(output_path, "PDF", resolution=100.0, save_all=True, append_images=pages[1:])
def apply_actions_and_regenerate_grouped(root_dir, data, student_id, actions, label_notes, all_labels): def apply_actions_and_regenerate_grouped(root_dir, data, student_id,
actions, label_notes, all_labels):
""" """
Modifies data based on actions, pastes label-specific note crops, Modifies data based on actions, pastes label-specific note crops,
regenerates label images for consistency, saves dirty ones, regenerates label images for consistency, saves dirty ones,
@ -161,7 +162,7 @@ def apply_actions_and_regenerate_grouped(root_dir, data, student_id, actions, la
result = content['result'] result = content['result']
d_notes[label] = str(result.get('score', 0)) d_notes[label] = str(result.get('score', 0))
pdf_path = os.path.join(root_dir, f"Copie{student_id}", f"{label}.pdf") pdf_path = Path(root_dir) / "Copies" / f"Copie{student_id}" / f"{label}.pdf"
if not os.path.exists(pdf_path): continue if not os.path.exists(pdf_path): continue
(base_img, _, _) = annotating.make_base_image(pdf_path) (base_img, _, _) = annotating.make_base_image(pdf_path)
@ -204,13 +205,15 @@ def apply_actions_and_regenerate_grouped(root_dir, data, student_id, actions, la
concat_list.append(final_img) concat_list.append(final_img)
perfect_no_comment = True perfect_no_comment = True
if float(d_notes[label]) != 4.0: if float(d_notes[label]) < 4.0:
perfect_no_comment = False perfect_no_comment = False
else: else:
if len(result.get('feedback', [])) != 0: lfb = result.get('feedback', [])
for e in lfb:
if "to_delete" not in e or not e["to_delete"]:
perfect_no_comment = False perfect_no_comment = False
if not perfect_no_comment: if not perfect_no_comment or has_notes:
extras = get_extra_pdfs_as_images(root_dir, label, annotating) extras = get_extra_pdfs_as_images(root_dir, label, annotating)
extras.append(final_img) extras.append(final_img)
concat_list_F.append(extras) concat_list_F.append(extras)
@ -333,7 +336,8 @@ if __name__ == "__main__":
if hmax > hmin: if hmax > hmin:
crop = notes_img.crop((0, hmin, notes_img.width, hmax)) crop = notes_img.crop((0, hmin, notes_img.width, hmax))
if has_significant_notes(crop): if has_significant_notes(crop):
notes_by_student[sid][lbl] = {'img': crop, 'old_header_h': img_info.get("header_height", 0)} notes_by_student[sid][lbl] = {'img': crop,
'old_header_h': img_info.get("header_height", 0)}
def process_refaire_entry(sid, r_labels): def process_refaire_entry(sid, r_labels):
@ -364,7 +368,9 @@ if __name__ == "__main__":
if hmax > hmin: if hmax > hmin:
crop = b_notes_img.crop((0, hmin, b_notes_img.width, hmax)) crop = b_notes_img.crop((0, hmin, b_notes_img.width, hmax))
if has_significant_notes(crop): if has_significant_notes(crop):
notes_by_student[sid][lbl] = {'img': crop, 'old_header_h': img_info.get("header_height", 0)} notes_by_student[sid][lbl] = \
{'img': crop,
'old_header_h': img_info.get("header_height", 0)}

View File

@ -37,7 +37,7 @@ def decode_json(pdf_file):
def split_an_interro(base_dir, input_pdf, coords_list): def split_an_interro(base_dir, input_pdf, coords_list):
doc = fitz.open(input_pdf) doc = fitz.open(input_pdf)
output_dir = base_dir / input_pdf.stem output_dir = base_dir / "Copies" / input_pdf.stem
generated_files = set() generated_files = set()
parts_by_label = defaultdict(list) parts_by_label = defaultdict(list)
@ -197,10 +197,13 @@ if __name__ == "__main__":
if input_arg.is_file(): if input_arg.is_file():
base_dir = input_arg.parent base_dir = input_arg.parent
if base_dir.name == "Copies":
base_dir = base_dir.parent
pdf_files = [input_arg] pdf_files = [input_arg]
elif input_arg.is_dir(): elif input_arg.is_dir():
base_dir = input_arg base_dir = input_arg
pdf_files = sorted(base_dir.glob("*.pdf")) copies_dir = base_dir / "Copies"
pdf_files = sorted(copies_dir.glob("*.pdf"))
else: else:
print(f"Error: {input_arg} is not a valid file or directory.") print(f"Error: {input_arg} is not a valid file or directory.")
sys.exit(1) sys.exit(1)

View File

@ -5,9 +5,7 @@ def natural_key(text):
return [int(c) if c.isdigit() else c.lower() for c in re.split(r'(\d+)', str(text))] return [int(c) if c.isdigit() else c.lower() for c in re.split(r'(\d+)', str(text))]
def read_all_labels(base_dir): def read_all_labels(base_dir):
return sorted(list(filter(None, return list(filter(None, (Path(base_dir) / "labels").read_text().splitlines()))
(Path(base_dir) / "labels").read_text().splitlines())),
key = natural_key)
def enonce_total(base_dir): def enonce_total(base_dir):
text_dir = Path(base_dir) / 'Text' text_dir = Path(base_dir) / 'Text'