Compare commits

...

2 Commits

9 changed files with 140 additions and 90 deletions

View File

@ -1,7 +1,7 @@
#+title: Script #+title: Script
#+author: Sébastien Miquel #+author: Sébastien Miquel
#+date: 14-03-2026 #+date: 14-03-2026
# Time-stamp: <07-05-26 11:33> # Time-stamp: <08-05-26 22:52>
#+OPTIONS: #+OPTIONS:
* Quézaco * Quézaco
@ -106,7 +106,13 @@ export GEMINI_API_KEY=…
labels des exercices/questions. labels des exercices/questions.
Rerun on a single file with =python cutleft.py Interro/Copie01.pdf= Rerun on a single file with =python cutleft.py Interro/Copie01.pdf=
5. =python enonce_info.py Interro= (gestion perso)
* Génération d'information sur l'énoncé
1. =python enonce_info.py Interro= (gestion perso)
OU
2. =python gemini_for_enonce.py Interro=
+ Nécessite =enonce.tex/org= et =correction.tex/org=
* Labelisation et regroupement * Labelisation et regroupement
@ -124,7 +130,7 @@ Set proxy with ~export HTTPS_PROXY="http://10.0.0.1:3128"~
+ Quand un label est manquant, il est possible de cliquer sur + Quand un label est manquant, il est possible de cliquer sur
l'image, ce qui copie les coordonnées dans le presse papier l'image, ce qui copie les coordonnées dans le presse papier
(sous linux…), puis on peut l'ajouter à la main. (sous linux…), puis on peut l'ajouter à la main.
+ Utilisation de =_=, =|…= et =…|=
Pour modifier une seule copie : Pour modifier une seule copie :
=python plotting.py Interro/Copie01.pdf= =python plotting.py Interro/Copie01.pdf=
@ -151,6 +157,7 @@ Set proxy with ~export HTTPS_PROXY="http://10.0.0.1:3128"~
2. =python correction.py Interro --limit 240= OU 2. =python correction.py Interro --limit 240= OU
=python correction.py Interro/Ex\ 2/Group_1.jpg= OU =python correction.py Interro/Ex\ 2/Group_1.jpg= OU
=python correction.py Interro --overwrite= =python correction.py Interro --overwrite=
=python correction.py Interro --pro-by-label= (needs =labels_for_pro=)
Fais les requêtes de correction à Gemini. Fais les requêtes de correction à Gemini.

View File

@ -396,6 +396,8 @@ def render_score_text(label, score, error, width_px, fontsize=30,
return img return img
# Width in pixels of an A4 page (210 mm, about 8.27 in) rendered at 200 DPI.
A4_WIDTH_200DPI = 1654
# Minimum width of a composed image: 90 % of the A4 width above.
TARGET_MIN_WIDTH = int(A4_WIDTH_200DPI * 0.9) # 1488 pixels
def compose_label_image(base_img, label, result, hmin, def compose_label_image(base_img, label, result, hmin,
render_fn=render_real_latex_text, render_fn=render_real_latex_text,
draw_callback=None, draw_callback=None,
@ -415,6 +417,17 @@ def compose_label_image(base_img, label, result, hmin,
draw_callback: Optional function(type, draw_obj, position_dict, data_dict) draw_callback: Optional function(type, draw_obj, position_dict, data_dict)
called when elements are placed. Used for checkboxes. called when elements are placed. Used for checkboxes.
""" """
left_pad = 0
if base_img.width < TARGET_MIN_WIDTH:
total_missing = TARGET_MIN_WIDTH - base_img.width
left_pad = min(total_missing, MARGIN_LEFT)
right_pad = total_missing - left_pad
new_base = Image.new("RGB", (TARGET_MIN_WIDTH, base_img.height), "white")
new_base.paste(base_img, (left_pad, 0))
base_img = new_base
score = result.get('score', 0) score = result.get('score', 0)
error = result.get('error', "") error = result.get('error', "")
feedbacks = result.get('feedback', []) feedbacks = result.get('feedback', [])
@ -485,8 +498,8 @@ def compose_label_image(base_img, label, result, hmin,
target_ymin = (ymin - hmin) + image_offset_y target_ymin = (ymin - hmin) + image_offset_y
target_ymax = (ymax - hmin) + image_offset_y target_ymax = (ymax - hmin) + image_offset_y
target_xmin = xmin + MARGIN_LEFT target_xmin = xmin + MARGIN_LEFT + left_pad
target_xmax = xmax + MARGIN_LEFT target_xmax = xmax + MARGIN_LEFT + left_pad
# Draw Rectangle (if not suppressed) # Draw Rectangle (if not suppressed)
if "norectangle" not in fb: if "norectangle" not in fb:

View File

@ -582,7 +582,7 @@ Here is a list of all possible labels. You need to answer with a list one of the
height = grouping.get_pdf_height(str(new_pdf_path)) height = grouping.get_pdf_height(str(new_pdf_path))
grouping.create_jpg(add_label, idx, [(pid, str(new_pdf_path), height)], INPUT_DIR) grouping.create_jpg(add_label, idx, [(pid, str(new_pdf_path), height)], INPUT_DIR)
new_tasks.append((str(Path(INPUT_DIR) / add_label / f"Group_{idx+1}.jpg"), new_tasks.append((str(Path(INPUT_DIR) / add_label / f"Group_{idx+1}.jpg"),
add_label, False, f"{label}(->)")) add_label, False))
error += f"(->){add_label}" error += f"(->){add_label}"
keep_error = True keep_error = True
else: else:
@ -603,7 +603,6 @@ def process_single_task(task_tuple, precomputed_response=None):
file_path = task_tuple[0] file_path = task_tuple[0]
label = task_tuple[1] label = task_tuple[1]
can_spawn_tasks = task_tuple[2] if len(task_tuple) > 2 else True can_spawn_tasks = task_tuple[2] if len(task_tuple) > 2 else True
injected_error = task_tuple[3] if len(task_tuple) > 3 else ""
group_name = os.path.splitext(file_path)[0] group_name = os.path.splitext(file_path)[0]
json_path = group_name + '.json' json_path = group_name + '.json'
@ -649,15 +648,6 @@ def process_single_task(task_tuple, precomputed_response=None):
for p in json_data: for p in json_data:
pid = p["id"] pid = p["id"]
res = p["result"] res = p["result"]
# Inject additional error if present
if injected_error:
if res["error"]:
res["error"] = f"{injected_error} {res['error']}"
else:
res["error"] = injected_error
yming, ymaxg, width_r = d_data[pid] yming, ymaxg, width_r = d_data[pid]
pdf_path = Path(INPUT_DIR) / f"Copie{pid}" / f"{label}.pdf" pdf_path = Path(INPUT_DIR) / f"Copie{pid}" / f"{label}.pdf"
@ -720,8 +710,6 @@ def process_single_task(task_tuple, precomputed_response=None):
tprint(f"Error decoding JSON for {file_path}", file=sys.stderr) tprint(f"Error decoding JSON for {file_path}", file=sys.stderr)
except Exception as e: except Exception as e:
error_msg = f"Exception processing {file_path}: {e}" error_msg = f"Exception processing {file_path}: {e}"
import traceback
traceback.print_exc() # <--- Add this line to see the real crash
print(error_msg, file=sys.stderr) print(error_msg, file=sys.stderr)
with io_lock: with io_lock:
errors_summary.append((error_msg, file_path)) errors_summary.append((error_msg, file_path))

View File

@ -26,7 +26,8 @@ if os.path.isfile(path_arg) and path_arg.lower().endswith('.pdf'):
files = [os.path.basename(path_arg)] files = [os.path.basename(path_arg)]
elif os.path.isdir(path_arg): elif os.path.isdir(path_arg):
INPUT_DIR = path_arg INPUT_DIR = path_arg
files = sorted([f for f in os.listdir(INPUT_DIR) if f.lower().endswith('.pdf')]) files = sorted([f for f in os.listdir(INPUT_DIR) if f.lower().endswith('.pdf') and
"nonc" not in f.lower()])
else: else:
sys.exit("Error: Input must be a directory or a PDF file.") sys.exit("Error: Input must be a directory or a PDF file.")
@ -83,12 +84,20 @@ def stitch_images(image_list):
return combined return combined
import threading
pdf_cache_lock = threading.Lock()
@lru_cache(maxsize=3) @lru_cache(maxsize=3)
def get_pdf_pages(filename): def _get_pdf_pages_cached(filename):
"""Caches the heavy PDF rendering step for the current and next files."""
pdf_path = os.path.join(INPUT_DIR, filename) pdf_path = os.path.join(INPUT_DIR, filename)
return convert_from_path(pdf_path) return convert_from_path(pdf_path)
def get_pdf_pages(filename):
    """Return the rendered pages of ``filename`` through the cached loader.

    The call to ``_get_pdf_pages_cached`` is made while holding
    ``pdf_cache_lock``, so only one caller at a time goes through the
    cached conversion.
    """
    with pdf_cache_lock:
        pages = _get_pdf_pages_cached(filename)
    return pages
def process_single_pdf(filename, shift_offset=0, max_per_file=5): def process_single_pdf(filename, shift_offset=0, max_per_file=5):
""" """
Converts PDF to stitched images. Converts PDF to stitched images.
@ -137,7 +146,8 @@ def process_single_pdf(filename, shift_offset=0, max_per_file=5):
# 3. Generate Preview (All stitched together, Resized) # 3. Generate Preview (All stitched together, Resized)
full_stitch = stitch_images(cropped_images) full_stitch = stitch_images(cropped_images)
preview_resized = full_stitch.resize(OUTPUT_SIZE, Image.LANCZOS) # preview_resized = full_stitch.resize(OUTPUT_SIZE, Image.LANCZOS)
preview_resized = full_stitch.resize(OUTPUT_SIZE, Image.BILINEAR)
schema = { schema = {
"original_filename": filename, "original_filename": filename,
@ -200,8 +210,6 @@ class ImageReviewer:
self.current_preview = None # Only stores the resized preview for GUI self.current_preview = None # Only stores the resized preview for GUI
self.is_processing = False self.is_processing = False
# Queue for pre-fetched results (index, (preview, splits, schema))
self.prefetch_queue = Queue(maxsize=1)
# Queue for manual re-processing results # Queue for manual re-processing results
self.manual_queue = Queue() self.manual_queue = Queue()
@ -244,19 +252,15 @@ class ImageReviewer:
self.trigger_processing(self.files[self.index], self.current_shift) self.trigger_processing(self.files[self.index], self.current_shift)
def prefetch_worker(self): def prefetch_worker(self):
"""Background thread to process the NEXT image constantly.""" """Background thread to load the NEXT file's PDF pages into RAM."""
idx_to_process = 0 idx_to_process = -1
while True: while True:
target = self.index + 1 target = self.index + 1
if target < len(self.files): if target < len(self.files) and target != idx_to_process:
if idx_to_process != target:
fname = self.files[target] fname = self.files[target]
result = process_single_pdf(fname, shift_offset=0) get_pdf_pages(fname) # Just calling it warms the lru_cache
if result:
self.prefetch_queue.put((target, result)) # Blocks if full
idx_to_process = target idx_to_process = target
time.sleep(0.05)
time.sleep(0.1)
def load_current_image(self, use_prefetch=False): def load_current_image(self, use_prefetch=False):
if self.index >= len(self.files): if self.index >= len(self.files):
@ -266,20 +270,9 @@ class ImageReviewer:
filename = self.files[self.index] filename = self.files[self.index]
self.is_processing = False self.is_processing = False
result_found = None
if use_prefetch and not self.prefetch_queue.empty():
q_idx, q_result = self.prefetch_queue.queue[0]
if q_idx == self.index:
_, result_found = self.prefetch_queue.get()
self.current_shift = 0 self.current_shift = 0
print(f"Loaded {filename} from prefetch.")
if result_found: # Always trigger processing. If prefetched, get_pdf_pages returns instantly.
self.handle_processing_result(result_found, filename)
else:
# Not in queue (first load or queue mismatch), process manually
self.trigger_processing(filename, self.current_shift) self.trigger_processing(filename, self.current_shift)
def trigger_processing(self, filename, shift): def trigger_processing(self, filename, shift):

View File

@ -314,7 +314,7 @@ def process_copy_group(group_key, files):
# Run ThreadPool on GROUPS (Copies), not individual files # Run ThreadPool on GROUPS (Copies), not individual files
# Each thread handles one student's full exam copy sequentially # Each thread handles one student's full exam copy sequentially
with ThreadPoolExecutor(max_workers=16) as executor: with ThreadPoolExecutor(max_workers=12) as executor:
# Convert dict items to arguments for map # Convert dict items to arguments for map
# executor.map expects a function and an iterable. # executor.map expects a function and an iterable.
# We use a lambda or separate function to unpack the tuple if needed, # We use a lambda or separate function to unpack the tuple if needed,

View File

@ -7,6 +7,7 @@ import os
import re import re
import glob import glob
import shutil import shutil
import subprocess
from pypdf import PdfReader, PdfWriter from pypdf import PdfReader, PdfWriter
# --- Constants --- # --- Constants ---
@ -94,7 +95,7 @@ class PDFPreviewer:
"← / → : Move line 1cm left/right\n" "← / → : Move line 1cm left/right\n"
"'c': Rotate page 180°, 'C' : rotate all pages, ',' : rotate all files\n" "'c': Rotate page 180°, 'C' : rotate all pages, ',' : rotate all files\n"
"t s r n m: keep left, next page, keep none, keep right, keep as is\n" "t s r n m: keep left, next page, keep none, keep right, keep as is\n"
"z: send this page to the end, 'R':restart file, 'P':back to previous file\n" "z: send this page to the end, 'A':pdf arranger 'R':restart file, 'P':back to previous file\n"
) )
self.info_label = tk.Label(master, text=instructions, justify=tk.LEFT) self.info_label = tk.Label(master, text=instructions, justify=tk.LEFT)
self.info_label.pack(pady=5, side=tk.TOP) self.info_label.pack(pady=5, side=tk.TOP)
@ -123,6 +124,7 @@ class PDFPreviewer:
self.master.bind("r", self.discard_page) self.master.bind("r", self.discard_page)
self.master.bind("z", self.send_page_end) self.master.bind("z", self.send_page_end)
self.master.bind("R", self.restart_current_file) self.master.bind("R", self.restart_current_file)
self.master.bind("A", self.start_arranger)
self.master.bind("P", self.go_to_previous_file) self.master.bind("P", self.go_to_previous_file)
@ -131,6 +133,9 @@ class PDFPreviewer:
self.current_zoom = 1.0 self.current_zoom = 1.0
def start_arranger(self, event=None):
    """Open the current PDF in the external ``pdf-arranger`` program.

    This method is registered as a Tk key binding, and Tk calls bound
    handlers with the triggering event as a positional argument. The
    ``event`` parameter must therefore exist, otherwise every key press
    raises ``TypeError``. It defaults to ``None`` so the method can still
    be called directly without arguments.

    Args:
        event: The Tk event that triggered the binding; unused.
    """
    # Popen does not wait for the child process, so the GUI is not blocked
    # while the external tool is open.
    # NOTE(review): the PDF Arranger project usually installs its executable
    # as `pdfarranger`; confirm that `pdf-arranger` exists on PATH.
    subprocess.Popen(["pdf-arranger", self.pdf_path])
def on_resize(self, event): def on_resize(self, event):
""" """
Handles window resize events by reloading the page. Handles window resize events by reloading the page.

View File

@ -309,7 +309,18 @@ class ImageViewer:
def on_open_interro(self, event): def on_open_interro(self, event):
if self.is_viewing and self.current_json_path: if self.is_viewing and self.current_json_path:
pdf_path = "/home/sebastien/Prépa/Staging/Interro/" + str(base_dir) + ".pdf" # Check local directory first
local_accent = self.base_dir / "énoncé.pdf"
local_plain = self.base_dir / "enonce.pdf"
if local_accent.exists():
pdf_path = str(local_accent)
elif local_plain.exists():
pdf_path = str(local_plain)
else:
# Fallback to the Interro staging directory
pdf_path = f"/home/sebastien/Prépa/Staging/Interro/{self.base_dir.name}.pdf"
print(f"Opening {pdf_path}") print(f"Opening {pdf_path}")
subprocess.Popen(['xdg-open', pdf_path]) subprocess.Popen(['xdg-open', pdf_path])

View File

@ -27,7 +27,7 @@ def decode_json(pdf_file):
(b, label) = d["box_2d"], d["label"] (b, label) = d["box_2d"], d["label"]
pn = page_number(b) pn = page_number(b)
carreau = 1000 // 38 carreau = 1000 // 38
result.append((label, pn, b[0] - int(carreau), b[2]-int(carreau))) result.append((label, pn, b[0] - int(carreau), b[2]-int(carreau), b[1], b[3]))
result.sort(key=lambda x: (x[1], x[2])) result.sort(key=lambda x: (x[1], x[2]))
return (name, result) return (name, result)
@ -39,11 +39,23 @@ def split_an_interro(base_dir, input_pdf, coords_list):
generated_files = set() generated_files = set()
parts_by_label = defaultdict(list) parts_by_label = defaultdict(list)
# Filter consecutive duplicate labels # 1. Parse labels to strip '|' and determine type: L (Left), R (Right), N (Normal)
parsed_coords = []
for item in coords_list:
label, pn, y0, y1, x0, x1 = item
if label.startswith("|"):
c_type, clean_label = "L", label[1:]
elif label.endswith("|"):
c_type, clean_label = "R", label[:-1]
else:
c_type, clean_label = "N", label
parsed_coords.append((clean_label, c_type, pn, y0, y1, x0, x1))
# 2. Filter consecutive duplicate labels based on the cleaned name
filtered_coords = [] filtered_coords = []
if coords_list: if parsed_coords:
filtered_coords.append(coords_list[0]) filtered_coords.append(parsed_coords[0])
for item in coords_list[1:]: for item in parsed_coords[1:]:
if item[0] != filtered_coords[-1][0]: if item[0] != filtered_coords[-1][0]:
filtered_coords.append(item) filtered_coords.append(item)
coords_list = filtered_coords coords_list = filtered_coords
@ -53,11 +65,11 @@ def split_an_interro(base_dir, input_pdf, coords_list):
page_height = page.rect.height page_height = page.rect.height
return (y / 1000) * page_height return (y / 1000) * page_height
def save_cropped_page(doc, page_num, y0, y1, out_path): def save_cropped_page(doc, page_num, x0, y0, x1, y1, out_path):
"""Saves a cropped portion of a page as a new PDF.""" """Saves a cropped portion of a page as a new PDF."""
page = doc[page_num] page = doc[page_num]
rotated_rect = page.rect * page.transformation_matrix rotated_rect = page.rect * page.transformation_matrix
visual_crop_rect = fitz.Rect(rotated_rect.x0, y0, rotated_rect.x1, y1) visual_crop_rect = fitz.Rect(rotated_rect.x0 + x0, y0, rotated_rect.x0 + x1, y1)
unrotated_clip_rect = visual_crop_rect * page.derotation_matrix unrotated_clip_rect = visual_crop_rect * page.derotation_matrix
temp_doc = fitz.open() temp_doc = fitz.open()
@ -76,46 +88,70 @@ def split_an_interro(base_dir, input_pdf, coords_list):
temp_doc.close() temp_doc.close()
# Iterate through all labels # Iterate through all labels
for idx, (title, start_page, y_start_raw, _) in enumerate(coords_list): for idx, (clean_label, c_type, start_page, y_start_raw, y_end_box, x0_raw, x1_raw) in enumerate(coords_list):
if clean_label == "_":
continue
temp_parts = [] temp_parts = []
# Determine the stopping point for this label
if idx + 1 < len(coords_list):
# Normal case: stop at the next label
_, end_page, _, y_end_raw = coords_list[idx + 1]
end_y_target_raw = y_end_raw
else:
# FIX BUG 1: Last label extends to the very end of the document
end_page = doc.page_count - 1 end_page = doc.page_count - 1
end_y_target_raw = 1000 # 1000 represents full height end_y_target_raw = 1000
# RULE 2: Determine stopping label
for next_item in coords_list[idx + 1:]:
n_clean, n_type, n_pn, n_y_start, _, _, _ = next_item
if c_type == "L":
is_stop = (n_type in ("L", "N"))
elif c_type == "R":
is_stop = (n_type in ("R", "N"))
else:
is_stop = True # Normal labels stop at anything
if is_stop:
end_page = n_pn
end_y_target_raw = n_y_start
break
# RULES 3 & 4: Calculate horizontal boundaries (0.0 to 1.0 fraction of local page width)
col_w = 1000 / doc.page_count
if c_type == "L": # |name
fraction_x0 = (x0_raw % col_w) / col_w
fraction_x1 = 1.0
end_y_target_raw = min(1000, end_y_target_raw + 40)
elif c_type == "R": # name|
fraction_x0 = 0.0
# Find the closest 'L' label in y-distance
L_labels = [it for it in parsed_coords if it[1] == "L"]
if L_labels:
closest_L = min(L_labels, key=lambda it: abs(it[3] - y_start_raw))
closest_L_x_center = (closest_L[5] + closest_L[6]) / 2.0
fraction_x1 = (closest_L_x_center % col_w) / col_w
if fraction_x1 <= fraction_x0: fraction_x1 = 1.0 # Fallback
else:
fraction_x1 = 1.0
else: # Normal
fraction_x0 = 0.0
fraction_x1 = 1.0
# FIX BUG 2: Iterate through EVERY page from start to end
# This handles cases where start_page == end_page, start_page + 1 == end_page,
# AND start_page + N == end_page (gaps)
current_p = start_page current_p = start_page
while current_p <= end_page: while current_p <= end_page:
page = doc[current_p]
# Determine Top Cut (y0) y0 = scale_coord(y_start_raw, page) if current_p == start_page else 0
if current_p == start_page: y1 = scale_coord(end_y_target_raw, page) if current_p == end_page else page.rect.height
y0 = scale_coord(y_start_raw, doc[current_p])
else:
y0 = 0 # Start from top of page for intermediate/last pages
# Determine Bottom Cut (y1)
if current_p == end_page:
y1 = scale_coord(end_y_target_raw, doc[current_p])
else:
y1 = doc[current_p].rect.height # Go to bottom of intermediate pages
# Only save if the slice has height (avoid empty files)
if y1 > y0 + 1: if y1 > y0 + 1:
# Convert fractions to absolute PDF points
x0_pdf = fraction_x0 * page.rect.width
x1_pdf = fraction_x1 * page.rect.width
temp_path = f"_part_{idx}_{current_p}.pdf" temp_path = f"_part_{idx}_{current_p}.pdf"
save_cropped_page(doc, current_p, y0, y1, temp_path) save_cropped_page(doc, current_p, x0_pdf, y0, x1_pdf, y1, temp_path)
temp_parts.append(temp_path) temp_parts.append(temp_path)
current_p += 1 current_p += 1
parts_by_label[title].extend(temp_parts) parts_by_label[clean_label].extend(temp_parts)
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)

View File

@ -14,10 +14,7 @@ def enonce_total(base_dir):
if not text_dir.is_dir(): if not text_dir.is_dir():
return "" return ""
# Exclude .tex and .pdf files files = [f for f in text_dir.iterdir() if f.is_file()]
files = [f for f in text_dir.iterdir()
if f.is_file() and f.suffix.lower() not in ('.tex', '.pdf')]
files.sort(key=lambda f: natural_key(f.name)) files.sort(key=lambda f: natural_key(f.name))
output = [] output = []