Divers, en lien avec DMI04 (smaller images, link to enonce.pdf)

Initial support for "_", "|_" et "…|" in label names
2026-05-09 16:25:08 +02:00 · 2026-05-08 22:20:54 +02:00
9 changed files with 140 additions and 90 deletions
--- a/Readme.org
+++ b/Readme.org
@ -1,7 +1,7 @@
 #+title:  Script
 #+author: Sébastien Miquel
 #+date:   14-03-2026
-# Time-stamp: <07-05-26 11:33>
+# Time-stamp: <08-05-26 22:52>
 #+OPTIONS:

 * Quézaco
@ -106,7 +106,13 @@ export GEMINI_API_KEY=…
    labels des exercices/questions.

    Rerun on a single file with =python cutleft.py Interro/Copie01.pdf=
- 5. =python enonce_info.py Interro= (gestion perso)
+
+* Génération d'information sur l'énoncé
+
+ 1. =python enonce_info.py Interro= (gestion perso)
+OU
+ 2. =python gemini_for_enonce.py Interro=
+    + Nécessite =enonce.tex/org= et =correction.tex/org=

 * Labelisation et regroupement

@ -124,7 +130,7 @@ Set proxy with ~export HTTPS_PROXY="http://10.0.0.1:3128"~
    + Quand un label est manquant, il est possible de cliquer sur
      l'image, ce qui copie les coordonnées dans le presse papier
      (sous linux…), puis on peut l'ajouter à la main.
-
+    + Utilisation de =_=, =|…= et =…|= dans les noms de labels
    Pour modifier une seule copie :
 =python plotting.py Interro/Copie01.pdf=

@ -151,6 +157,7 @@ Set proxy with ~export HTTPS_PROXY="http://10.0.0.1:3128"~
 2. =python correction.py Interro --limit 240= OU
 =python correction.py Interro/Ex\ 2/Group_1.jpg= OU
 =python correction.py Interro --overwrite=
+ =python correction.py Interro --pro-by-label= (needs =labels_for_pro=)

    Fais les requêtes de correction à Gemini.

--- a/annotating.py
+++ b/annotating.py
@ -396,6 +396,8 @@ def render_score_text(label, score, error, width_px, fontsize=30,

    return img

+A4_WIDTH_200DPI = 1654  # A4 width (210 mm, about 8.27 in) rendered at 200 dpi, in pixels
+TARGET_MIN_WIDTH = int(A4_WIDTH_200DPI * 0.9) # 1488 pixels: narrower base images get padded up to this
 def compose_label_image(base_img, label, result, hmin,
                        render_fn=render_real_latex_text,
                        draw_callback=None,
@ -415,6 +417,17 @@ def compose_label_image(base_img, label, result, hmin,
        draw_callback: Optional function(type, draw_obj, position_dict, data_dict)
                       called when elements are placed. Used for checkboxes.
    """
+
+    left_pad = 0
+    if base_img.width < TARGET_MIN_WIDTH:
+        total_missing = TARGET_MIN_WIDTH - base_img.width
+        left_pad = min(total_missing, MARGIN_LEFT)
+        right_pad = total_missing - left_pad
+
+        new_base = Image.new("RGB", (TARGET_MIN_WIDTH, base_img.height), "white")
+        new_base.paste(base_img, (left_pad, 0))
+        base_img = new_base
+
    score = result.get('score', 0)
    error = result.get('error', "")
    feedbacks = result.get('feedback', [])
@ -485,8 +498,8 @@ def compose_label_image(base_img, label, result, hmin,

        target_ymin = (ymin - hmin) + image_offset_y
        target_ymax = (ymax - hmin) + image_offset_y
-        target_xmin = xmin + MARGIN_LEFT
-        target_xmax = xmax + MARGIN_LEFT
+        target_xmin = xmin + MARGIN_LEFT + left_pad
+        target_xmax = xmax + MARGIN_LEFT + left_pad

        # Draw Rectangle (if not suppressed)
        if "norectangle" not in fb:
--- a/correction.py
+++ b/correction.py
@ -582,7 +582,7 @@ Here is a list of all possible labels. You need to answer with a list one of the
                height = grouping.get_pdf_height(str(new_pdf_path))
                grouping.create_jpg(add_label, idx, [(pid, str(new_pdf_path), height)], INPUT_DIR)
                new_tasks.append((str(Path(INPUT_DIR) / add_label / f"Group_{idx+1}.jpg"),
-                                  add_label, False, f"{label}(->)"))
+                                  add_label, False))
                error += f"(->){add_label}"
                keep_error = True
            else:
@ -603,7 +603,6 @@ def process_single_task(task_tuple, precomputed_response=None):
        file_path = task_tuple[0]
        label = task_tuple[1]
        can_spawn_tasks = task_tuple[2] if len(task_tuple) > 2 else True
-        injected_error = task_tuple[3] if len(task_tuple) > 3 else ""

        group_name = os.path.splitext(file_path)[0]
        json_path = group_name + '.json'
@ -649,15 +648,6 @@ def process_single_task(task_tuple, precomputed_response=None):
            for p in json_data:
                pid = p["id"]
                res = p["result"]
-
-                # Inject additional error if present
-                if injected_error:
-                    if res["error"]:
-                        res["error"] = f"{injected_error} {res['error']}"
-                    else:
-                        res["error"] = injected_error
-
-
                yming, ymaxg, width_r = d_data[pid]

                pdf_path = Path(INPUT_DIR) / f"Copie{pid}" / f"{label}.pdf"
@ -720,8 +710,6 @@ def process_single_task(task_tuple, precomputed_response=None):
            tprint(f"Error decoding JSON for {file_path}", file=sys.stderr)
        except Exception as e:
            error_msg = f"Exception processing {file_path}: {e}"
-            import traceback
-            traceback.print_exc() # <--- Add this line to see the real crash
            print(error_msg, file=sys.stderr)
            with io_lock:
                errors_summary.append((error_msg, file_path))
--- a/cutleft.py
+++ b/cutleft.py
@ -26,7 +26,8 @@ if os.path.isfile(path_arg) and path_arg.lower().endswith('.pdf'):
    files = [os.path.basename(path_arg)]
 elif os.path.isdir(path_arg):
    INPUT_DIR = path_arg
-    files = sorted([f for f in os.listdir(INPUT_DIR) if f.lower().endswith('.pdf')])
+    files = sorted([f for f in os.listdir(INPUT_DIR) if f.lower().endswith('.pdf') and
+                    "nonc" not in f.lower()])
 else:
    sys.exit("Error: Input must be a directory or a PDF file.")

@ -83,12 +84,20 @@ def stitch_images(image_list):

    return combined

+import threading
+pdf_cache_lock = threading.Lock()
+
+
@lru_cache(maxsize=3)
-def get_pdf_pages(filename):
-    """Caches the heavy PDF rendering step for the current and next files."""
+def _get_pdf_pages_cached(filename):
    pdf_path = os.path.join(INPUT_DIR, filename)
    return convert_from_path(pdf_path)

+def get_pdf_pages(filename):
+    """Return the cached PDF conversion of `filename`; all calls are serialized by pdf_cache_lock."""
+    with pdf_cache_lock:  # held for the whole (possibly slow) conversion, so concurrent callers wait
+        return _get_pdf_pages_cached(filename)
+
 def process_single_pdf(filename, shift_offset=0, max_per_file=5):
    """
    Converts PDF to stitched images.
@ -137,7 +146,8 @@ def process_single_pdf(filename, shift_offset=0, max_per_file=5):

        # 3. Generate Preview (All stitched together, Resized)
        full_stitch = stitch_images(cropped_images)
-        preview_resized = full_stitch.resize(OUTPUT_SIZE, Image.LANCZOS)
+        # preview_resized = full_stitch.resize(OUTPUT_SIZE, Image.LANCZOS)
+        preview_resized = full_stitch.resize(OUTPUT_SIZE, Image.BILINEAR)

        schema = {
            "original_filename": filename,
@ -200,8 +210,6 @@ class ImageReviewer:
        self.current_preview = None # Only stores the resized preview for GUI
        self.is_processing = False

-        # Queue for pre-fetched results (index, (preview, splits, schema))
-        self.prefetch_queue = Queue(maxsize=1)
        # Queue for manual re-processing results
        self.manual_queue = Queue()

@ -244,19 +252,15 @@ class ImageReviewer:
        self.trigger_processing(self.files[self.index], self.current_shift)

    def prefetch_worker(self):
-        """Background thread to process the NEXT image constantly."""
-        idx_to_process = 0
+        """Background thread to load the NEXT file's PDF pages into RAM."""
+        idx_to_process = -1
        while True:
            target = self.index + 1
-            if target < len(self.files):
-                if idx_to_process != target:
+            if target < len(self.files) and target != idx_to_process:
                fname = self.files[target]
-                    result = process_single_pdf(fname, shift_offset=0)
-                    if result:
-                        self.prefetch_queue.put((target, result)) # Blocks if full
+                get_pdf_pages(fname)  # Just calling it warms the lru_cache
                idx_to_process = target
-
-            time.sleep(0.1)
+            time.sleep(0.05)

    def load_current_image(self, use_prefetch=False):
        if self.index >= len(self.files):
@ -266,20 +270,9 @@ class ImageReviewer:

        filename = self.files[self.index]
        self.is_processing = False
-
-        result_found = None
-
-        if use_prefetch and not self.prefetch_queue.empty():
-            q_idx, q_result = self.prefetch_queue.queue[0]
-            if q_idx == self.index:
-                _, result_found = self.prefetch_queue.get()
        self.current_shift = 0
-                print(f"Loaded {filename} from prefetch.")

-        if result_found:
-            self.handle_processing_result(result_found, filename)
-        else:
-            # Not in queue (first load or queue mismatch), process manually
+        # Always trigger processing. If prefetched, get_pdf_pages returns instantly.
        self.trigger_processing(filename, self.current_shift)

    def trigger_processing(self, filename, shift):
--- a/gemini_for_labels.py
+++ b/gemini_for_labels.py
@ -314,7 +314,7 @@ def process_copy_group(group_key, files):

 # Run ThreadPool on GROUPS (Copies), not individual files
 # Each thread handles one student's full exam copy sequentially
-with ThreadPoolExecutor(max_workers=16) as executor:
+with ThreadPoolExecutor(max_workers=12) as executor:
    # Convert dict items to arguments for map
    # executor.map expects a function and an iterable.
    # We use a lambda or separate function to unpack the tuple if needed,
--- a/page_splitter.py
+++ b/page_splitter.py
@ -7,6 +7,7 @@ import os
 import re
 import glob
 import shutil
+import subprocess
 from pypdf import PdfReader, PdfWriter

 # --- Constants ---
@ -94,7 +95,7 @@ class PDFPreviewer:
            "← / → : Move line 1cm left/right\n"
            "'c': Rotate page 180°, 'C' : rotate all pages, ',' : rotate all files\n"
            "t s r n m: keep left, next page, keep none, keep right, keep as is\n"
-            "z: send this page to the end, 'R':restart file, 'P':back to previous file\n"
+            "z: send this page to the end, 'A':pdf arranger 'R':restart file, 'P':back to previous file\n"
        )
        self.info_label = tk.Label(master, text=instructions, justify=tk.LEFT)
        self.info_label.pack(pady=5, side=tk.TOP)
@ -123,6 +124,7 @@ class PDFPreviewer:
        self.master.bind("r", self.discard_page)
        self.master.bind("z", self.send_page_end)
        self.master.bind("R", self.restart_current_file)
+        self.master.bind("A", self.start_arranger)
        self.master.bind("P", self.go_to_previous_file)


@ -131,6 +133,9 @@ class PDFPreviewer:

        self.current_zoom = 1.0

+    def start_arranger(self):
+        subprocess.Popen(["pdf-arranger", self.pdf_path])
+
    def on_resize(self, event):
        """
        Handles window resize events by reloading the page.
--- a/plotting.py
+++ b/plotting.py
@ -309,7 +309,18 @@ class ImageViewer:

    def on_open_interro(self, event):
        if self.is_viewing and self.current_json_path:
-            pdf_path = "/home/sebastien/Prépa/Staging/Interro/" + str(base_dir) + ".pdf"
+            # Prefer a PDF stored in self.base_dir: "énoncé.pdf" first, then "enonce.pdf"
+            local_accent = self.base_dir / "énoncé.pdf"
+            local_plain = self.base_dir / "enonce.pdf"
+
+            if local_accent.exists():
+                pdf_path = str(local_accent)
+            elif local_plain.exists():
+                pdf_path = str(local_plain)
+            else:
+                # Fallback to the Interro staging directory (machine-specific absolute path), named after base_dir
+                pdf_path = f"/home/sebastien/Prépa/Staging/Interro/{self.base_dir.name}.pdf"
+
            print(f"Opening {pdf_path}")
            subprocess.Popen(['xdg-open', pdf_path])

--- a/splitting_int.py
+++ b/splitting_int.py
@ -27,7 +27,7 @@ def decode_json(pdf_file):
        (b, label) = d["box_2d"], d["label"]
        pn = page_number(b)
        carreau = 1000 // 38
-        result.append((label, pn, b[0] - int(carreau), b[2]-int(carreau)))
+        result.append((label, pn, b[0] - int(carreau), b[2]-int(carreau), b[1], b[3]))
    result.sort(key=lambda x: (x[1], x[2]))
    return (name, result)

@ -39,11 +39,23 @@ def split_an_interro(base_dir, input_pdf, coords_list):
    generated_files = set()
    parts_by_label = defaultdict(list)

-    # Filter consecutive duplicate labels
+    # 1. Parse labels to strip '|' and determine type: L (Left), R (Right), N (Normal)
+    parsed_coords = []
+    for item in coords_list:
+        label, pn, y0, y1, x0, x1 = item
+        if label.startswith("|"):
+            c_type, clean_label = "L", label[1:]
+        elif label.endswith("|"):
+            c_type, clean_label = "R", label[:-1]
+        else:
+            c_type, clean_label = "N", label
+        parsed_coords.append((clean_label, c_type, pn, y0, y1, x0, x1))
+
+    # 2. Filter consecutive duplicate labels based on the cleaned name
    filtered_coords = []
-    if coords_list:
-        filtered_coords.append(coords_list[0])
-        for item in coords_list[1:]:
+    if parsed_coords:
+        filtered_coords.append(parsed_coords[0])
+        for item in parsed_coords[1:]:
            if item[0] != filtered_coords[-1][0]:
                filtered_coords.append(item)
    coords_list = filtered_coords
@ -53,11 +65,11 @@ def split_an_interro(base_dir, input_pdf, coords_list):
        page_height = page.rect.height
        return (y / 1000) * page_height

-    def save_cropped_page(doc, page_num, y0, y1, out_path):
+    def save_cropped_page(doc, page_num, x0, y0, x1, y1, out_path):
        """Saves a cropped portion of a page as a new PDF."""
        page = doc[page_num]
        rotated_rect = page.rect * page.transformation_matrix
-        visual_crop_rect = fitz.Rect(rotated_rect.x0, y0, rotated_rect.x1, y1)
+        visual_crop_rect = fitz.Rect(rotated_rect.x0 + x0, y0, rotated_rect.x0 + x1, y1)
        unrotated_clip_rect = visual_crop_rect * page.derotation_matrix

        temp_doc = fitz.open()
@ -76,46 +88,70 @@ def split_an_interro(base_dir, input_pdf, coords_list):
        temp_doc.close()

    # Iterate through all labels
-    for idx, (title, start_page, y_start_raw, _) in enumerate(coords_list):
+    for idx, (clean_label, c_type, start_page, y_start_raw, y_end_box, x0_raw, x1_raw) in enumerate(coords_list):
+        if clean_label == "_":
+            continue
+
        temp_parts = []
-
-        # Determine the stopping point for this label
-        if idx + 1 < len(coords_list):
-            # Normal case: stop at the next label
-            _, end_page, _, y_end_raw = coords_list[idx + 1]
-            end_y_target_raw = y_end_raw
-        else:
-            # FIX BUG 1: Last label extends to the very end of the document
        end_page = doc.page_count - 1
-            end_y_target_raw = 1000  # 1000 represents full height
+        end_y_target_raw = 1000
+
+        # RULE 2: Determine stopping label
+        for next_item in coords_list[idx + 1:]:
+            n_clean, n_type, n_pn, n_y_start, _, _, _ = next_item
+
+            if c_type == "L":
+                is_stop = (n_type in ("L", "N"))
+            elif c_type == "R":
+                is_stop = (n_type in ("R", "N"))
+            else:
+                is_stop = True # Normal labels stop at anything
+
+            if is_stop:
+                end_page = n_pn
+                end_y_target_raw = n_y_start
+                break
+
+        # RULES 3 & 4: Calculate horizontal boundaries (0.0 to 1.0 fraction of local page width)
+        col_w = 1000 / doc.page_count
+        if c_type == "L": # |name
+            fraction_x0 = (x0_raw % col_w) / col_w
+            fraction_x1 = 1.0
+            end_y_target_raw = min(1000, end_y_target_raw + 40)
+        elif c_type == "R": # name|
+            fraction_x0 = 0.0
+            # Find the closest 'L' label in y-distance
+            L_labels = [it for it in parsed_coords if it[1] == "L"]
+            if L_labels:
+                closest_L = min(L_labels, key=lambda it: abs(it[3] - y_start_raw))
+                closest_L_x_center = (closest_L[5] + closest_L[6]) / 2.0
+                fraction_x1 = (closest_L_x_center % col_w) / col_w
+                if fraction_x1 <= fraction_x0: fraction_x1 = 1.0 # Fallback
+            else:
+                fraction_x1 = 1.0
+        else: # Normal
+            fraction_x0 = 0.0
+            fraction_x1 = 1.0

-        # FIX BUG 2: Iterate through EVERY page from start to end
-        # This handles cases where start_page == end_page, start_page + 1 == end_page,
-        # AND start_page + N == end_page (gaps)
        current_p = start_page
        while current_p <= end_page:
+            page = doc[current_p]

-            # Determine Top Cut (y0)
-            if current_p == start_page:
-                y0 = scale_coord(y_start_raw, doc[current_p])
-            else:
-                y0 = 0 # Start from top of page for intermediate/last pages
+            y0 = scale_coord(y_start_raw, page) if current_p == start_page else 0
+            y1 = scale_coord(end_y_target_raw, page) if current_p == end_page else page.rect.height

-            # Determine Bottom Cut (y1)
-            if current_p == end_page:
-                y1 = scale_coord(end_y_target_raw, doc[current_p])
-            else:
-                y1 = doc[current_p].rect.height # Go to bottom of intermediate pages
-
-            # Only save if the slice has height (avoid empty files)
            if y1 > y0 + 1:
+                # Convert fractions to absolute PDF points
+                x0_pdf = fraction_x0 * page.rect.width
+                x1_pdf = fraction_x1 * page.rect.width
+
                temp_path = f"_part_{idx}_{current_p}.pdf"
-                save_cropped_page(doc, current_p, y0, y1, temp_path)
+                save_cropped_page(doc, current_p, x0_pdf, y0, x1_pdf, y1, temp_path)
                temp_parts.append(temp_path)

            current_p += 1

-        parts_by_label[title].extend(temp_parts)
+        parts_by_label[clean_label].extend(temp_parts)

    output_dir.mkdir(parents=True, exist_ok=True)

--- a/utils.py
+++ b/utils.py
@ -14,10 +14,7 @@ def enonce_total(base_dir):
    if not text_dir.is_dir():
        return ""

-    # Exclude .tex and .pdf files
-    files = [f for f in text_dir.iterdir()
-             if f.is_file() and f.suffix.lower() not in ('.tex', '.pdf')]
-
+    files = [f for f in text_dir.iterdir() if f.is_file()]
    files.sort(key=lambda f: natural_key(f.name))

    output = []
Author	SHA1	Message	Date
Sébastien Miquel	a2f6f6eec0	Divers, en lien avec DMI04 (smaller images, link to enonce.pdf)	2026-05-09 16:25:08 +02:00
Sébastien Miquel	8d9165d0ac	Initial support for "_", "\|_" et "…\|" in label names	2026-05-08 22:20:54 +02:00