Initial commit

2026-01-18 14:46:54 +01:00 · 2026-01-18 14:46:54 +01:00 · 6a0c1a3958
commit 6a0c1a3958
14 changed files with 2746 additions and 0 deletions
--- a/annotating.py
+++ b/annotating.py
@ -0,0 +1,450 @@
 import sys
 import os
 import json
 import glob
 from PIL import Image
 # The result maps: Copie id -> label -> {pdf_path, result, coordinates}.
 # "coordinates" are the real coordinates (hmin, hmax) of the answer inside its Group image.
 # The box_2d values of the Gemini result are converted from the 0-1000 normalized range to pixels.
 def make_dictionary(root_dir):
    """Build the annotation dictionary from ``<root_dir>/correction.json``.

    For every graded entry, the group JSON files in ``<root_dir>/<label>/``
    are searched for the student's id. The ``.jpg`` next to the matching JSON
    gives the pixel size used to convert the entry's ``box_2d`` values from
    the 0-1000 normalized range to pixels (the result object is modified in
    place).

    Args:
        root_dir: Directory containing ``correction.json`` and one
            sub-directory per label.

    Returns:
        ``{student_id: {label: {"pdf_path", "result", "coordinates"}}}``.
        ``coordinates`` is ``(entry[1], entry[2])`` taken from the group
        JSON, or ``None`` when the id was not found in any group file.

    Exits the process with status 1 when ``correction.json`` is missing.
    """
    correction_path = os.path.join(root_dir, "correction.json")
    # Load correction data
    try:
        with open(correction_path, 'r', encoding='utf-8') as f:
            corrections = json.load(f)
    except FileNotFoundError:
        print(f"Error: {correction_path} not found.")
        sys.exit(1)
    # Dictionary: keys are IDs
    result_data = {}
    for label, items in corrections.items():
        # The group files only depend on the label: list them once per label
        # instead of once per graded entry.
        json_files = glob.glob(os.path.join(root_dir, label, "*.json"))
        # Each label holds a list of lists of entries: flatten it.
        for item in (entry for batch in items for entry in batch):
            student_id = item['id']
            result_obj = item['result']
            # Find coordinates and the size of the group image
            coordinates = None
            height, width = None, None
            for jf in json_files:
                try:
                    with open(jf, 'r', encoding='utf-8') as f:
                        coord_list = json.load(f)
                except json.JSONDecodeError:
                    continue
                # Format: [["id", x, y], ...]
                for entry in coord_list:
                    if entry[0] == student_id:
                        coordinates = (entry[1], entry[2])
                        img_path = os.path.splitext(jf)[0] + ".jpg"
                        with Image.open(img_path) as img:
                            width, height = img.size
                        break
                if coordinates is not None:
                    break
            # Construct PDF path: Dir/Copie{id}/{label}.pdf
            pdf_path = os.path.join(root_dir, f"Copie{student_id}", f"{label}.pdf")
            for el in result_obj.get("feedback", []):
                box = el.get("box_2d")
                if not box or len(box) < 4:
                    continue
                if height is None or width is None:
                    # Without the group image size the box cannot be converted
                    # to pixels; keep the text but drop the location instead
                    # of crashing on `int * None`.
                    print(f"Warning: no group image found for id {student_id} "
                          f"in {label}; dropping box_2d.")
                    el["box_2d"] = None
                    continue
                # box_2d is [ymin, xmin, ymax, xmax]
                box[0] = (box[0] * height) // 1000
                box[2] = (box[2] * height) // 1000
                box[1] = (box[1] * width) // 1000
                box[3] = (box[3] * width) // 1000
            # Populate the object (creating the per-student dict if missing)
            result_data.setdefault(student_id, {})[label] = {
                "pdf_path": pdf_path,
                "result": result_obj,
                "coordinates": coordinates
            }
    return result_data
 # output the resulting dictionary
 # print(json.dumps(result_data, indent=2, ensure_ascii=False))
 import io
 import shutil
 from pdf2image import convert_from_path
 from PIL import Image, ImageDraw, ImageFont
 import matplotlib.pyplot as plt
 # plt.rcParams.update({ "text.usetex": True,
                     # "text.latex.preamble": r"\usepackage{bbold}"})
 import re
 import textwrap
 def normalize_mathtext(text):
    """
    Rewrite LaTeX fragments so that Matplotlib's mathtext parser accepts them.

    Regex rules are applied first (\\le -> \\leq, \\ge -> \\geq,
    \\implies -> \\Rightarrow), then literal replacements, in order.
    The (?![a-zA-Z]) lookahead keeps longer commands such as \\left intact.
    """
    regex_rules = (
        (r'\\le(?![a-zA-Z])', r'\\leq'),
        (r'\\ge(?![a-zA-Z])', r'\\geq'),
        (r'\\implies', r'\\Rightarrow'),
    )
    for pattern, replacement in regex_rules:
        text = re.sub(pattern, replacement, text)
    literal_rules = (
        # Doubled backslashes (over-escaped output) collapse to a single one
        ("\\\\", "\\"),
        ("\\llbracket", "[\\!["),
        ("\\rrbracket", "]\\!]"),
        # An under-escaped "\f" in the JSON decodes to a form feed: restore it
        ('\f', r'\f'),
        # Drop stray U+0010 control characters
        ('\u0010', ""),
    )
    for old, new in literal_rules:
        text = text.replace(old, new)
    return text
 import re
 def wrap_latex_text(text, width_chars):
    """
    Wraps text but keeps LaTeX math blocks ($...$) intact.

    Plain text is split on whitespace; each ``$...$`` block is one
    unbreakable token. Tokens are packed greedily so that no line is longer
    than ``width_chars``, except for a single token that is itself longer
    than the width (it gets a line of its own).

    Returns the wrapped text with lines joined by ``"\\n"`` (an empty string
    when there is no token).
    """
    # 1. Split text into chunks of: text, math, text, math...
    # The regex looks for $...$ (non-greedy).
    parts = re.split(r'(\$[^\$]+\$)', text)
    # 2. Tokenize: break plain text by spaces, keep math blocks whole.
    tokens = []
    for part in parts:
        if part.startswith('$') and part.endswith('$'):
            tokens.append(part)
        else:
            tokens.extend(part.split())
    # 3. Greedy line filling
    lines = []
    current_line = []
    current_length = 0  # length of " ".join(current_line)
    for token in tokens:
        token_len = len(token)
        # A separating space is only needed when the line already has content
        candidate = current_length + token_len + (1 if current_line else 0)
        # Never flush an empty line: an over-long first token must not
        # produce a blank line in front of it.
        if current_line and candidate > width_chars:
            lines.append(" ".join(current_line))
            current_line = [token]
            current_length = token_len
        else:
            current_line.append(token)
            current_length = candidate
    if current_line:
        lines.append(" ".join(current_line))
    return "\n".join(lines)
 def render_latex_text(text, width_px, bg_color=(255, 255, 255, 255), max_lines=None,
                      fontsize=14):
    """Render text (with optional ``$...$`` math) to an RGBA PIL image.

    The text is normalized with ``normalize_mathtext``, wrapped with
    ``wrap_latex_text`` and drawn with Matplotlib on a transparent figure,
    which is then composited over ``bg_color``.

    Args:
        text: Text to render.
        width_px: Target figure width in pixels (at 100 dpi). The saved image
            is cropped with ``bbox_inches='tight'``, so its final size may
            differ.
        bg_color: RGBA background of the returned image.
        max_lines: Accepted for compatibility; currently not enforced.
        fontsize: Matplotlib font size.

    Returns:
        A new ``PIL.Image`` in RGBA mode.
    """
    # 1. Fix unsupported symbols
    text = normalize_mathtext(text)
    dpi = 100
    fig_width = width_px / dpi
    # Heuristic: about 10 characters per inch of width
    chars_per_line = int(fig_width * 10)
    # Pre-wrap the text respecting LaTeX boundaries
    wrapped_text = wrap_latex_text(text, chars_per_line)
    # Dynamic height based on the actual number of lines
    # (0.3 inch per line plus a 0.2 inch buffer)
    num_lines = wrapped_text.count('\n') + 1
    fig_height = num_lines * 0.3 + 0.2
    fig = plt.figure(figsize=(fig_width, fig_height), dpi=dpi)
    buf = io.BytesIO()
    try:
        # NOTE: wrap=False because we did it ourselves
        plt.text(0.01, 0.95, wrapped_text, fontsize=fontsize,
                 verticalalignment='top', horizontalalignment='left',
                 wrap=False)
        plt.axis('off')
        plt.savefig(buf, format='png', bbox_inches='tight', pad_inches=0.1, transparent=True)
    finally:
        # Always release the figure, even when drawing/saving raises,
        # otherwise figures accumulate across the many calls of a run.
        plt.close(fig)
    buf.seek(0)
    img = Image.open(buf).convert("RGBA")
    # Create background
    final_img = Image.new("RGBA", img.size, bg_color)
    final_img.alpha_composite(img)
    return final_img
 def process_correction(root_dir, data, all_labels):
    """Write the annotated images and score summary of every student.

    For each student id in ``data`` an ``Anot_Copie<id>`` directory is
    (re)created under ``root_dir``. For each label, the PDF
    ``Copie<id>/<label>.pdf`` is rasterized, its pages are stacked
    vertically, a header (label, score, optional error, global feedback) is
    put on top, and each feedback having a ``box_2d`` is drawn as a red
    rectangle with its text pasted near the left edge. The result is saved
    as ``<label>.jpg``; ``score.json`` and ``Concat.jpg`` are written last.

    Students whose ``Concat.jpg`` already exists are skipped.

    Args:
        root_dir: Root directory of the exam.
        data: Dictionary as returned by ``make_dictionary``.
        all_labels: All labels; used as keys of ``score.json`` (labels
            without a result keep an empty string).
    """
    margin_left = 200  # horizontal offset (px) of the page inside the canvas
    for student_id, labels in data.items():
        # Prepare output directory: Dir/Anot_CopieID
        output_dir = os.path.join(root_dir, f"Anot_Copie{student_id}")
        # Check if already processed (Concat.jpg exists)
        concat_path = os.path.join(output_dir, "Concat.jpg")
        if os.path.exists(concat_path):
            print(f"Skipping Copie {student_id} (Concat.jpg exists)")
            continue
        print("Processing :", student_id)
        # Clean folder if re-processing
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)
        os.makedirs(output_dir)
        d_notes = dict.fromkeys(all_labels,"")
        for label, content in labels.items():
            # 1. Find PDF path
            copie_folder = f"Copie{student_id}"
            pdf_rel_path = os.path.join(copie_folder, f"{label}.pdf")
            pdf_full_path = os.path.join(root_dir, pdf_rel_path)
            if not os.path.exists(pdf_full_path):
                print(f"File not found: {pdf_full_path}")
                continue
            # 2. Convert PDF to Image
            try:
                pages = convert_from_path(pdf_full_path)
                # Calculate total dimensions
                total_h = sum(page.height for page in pages)
                max_w = max(page.width for page in pages)
                # Create concatenated base image
                base_img = Image.new("RGBA", (max_w, total_h), "white")
                current_y = 0
                for page in pages:
                    base_img.paste(page.convert("RGBA"), (0, current_y))
                    current_y += page.height
            except Exception as e:
                print(f"Error converting {pdf_full_path}: {e}")
                continue
            # The key is always present but holds None when the id was not
            # found in any group file, so `.get(key, default)` is not enough.
            coordinates = content.get('coordinates') or (0, 0) # (hmin, hmax)
            hmin = coordinates[0]
            result = content.get('result', {})
            score = result.get('score', 0)
            error = result.get('error', "")
            feedbacks = result.get('feedback', [])
            # Organize feedbacks
            global_fb = [f for f in feedbacks if not f.get('box_2d')]
            local_fb = [f for f in feedbacks if f.get('box_2d')]
            # Sort local feedback by Y position
            local_fb.sort(key=lambda x: x['box_2d'][0])
            # --- PREPARE HEADERS ---
            header_elements = []
            score_text = f"{label}   ;   Note : {score}"
            d_notes[label] = str(score)
            if error and error != "null":
                score_text += f"   |   Error: {error}"
            # Render Row 1
            row1_img = render_latex_text(score_text, base_img.width,fontsize=18)
            header_elements.append(row1_img)
            # Render Global Feedbacks (Rows 2+)
            for fb in global_fb:
                fb_img = render_latex_text(fb['text'], base_img.width)
                header_elements.append(fb_img)
            # Calculate total new height
            header_height = sum(img.height for img in header_elements)
            total_height = base_img.height + header_height
            # Create Canvas
            final_img = Image.new("RGB", (base_img.width + margin_left, total_height), "white")
            # Paste Headers
            current_y = 0
            for elem in header_elements:
                final_img.paste(elem, (0, current_y))
                current_y += elem.height
            # Paste Original Image
            # Note: current_y is now the offset for the actual image content
            image_offset_y = current_y
            final_img.paste(base_img, (margin_left, image_offset_y))
            # --- DRAW LOCAL ANNOTATIONS ---
            draw = ImageDraw.Draw(final_img, "RGBA")
            last_text_bottom = 0
            for fb in local_fb:
                box = fb.get('box_2d')
                if not box or len(box) < 4:
                    continue
                ymin, xmin, ymax, xmax = box[0], box[1], box[2], box[3]
                # Boxes are expressed in the group image: shift by hmin to get
                # coordinates relative to this answer, then by the header.
                target_ymin = (ymin - hmin) + image_offset_y
                target_ymax = (ymax - hmin) + image_offset_y
                target_xmin = xmin + margin_left
                target_xmax = xmax + margin_left
                # Draw Rectangle
                draw.rectangle([target_xmin, target_ymin, target_xmax, target_ymax], outline="red", width=3)
                # Render the text on a light red, semi-transparent background
                txt_img = render_latex_text(
                    fb['text'],
                    width_px=500,
                    bg_color=(255, 200, 200, 180),
                    max_lines=3
                )
                # Vertically center the text on the box, below the header
                txt_h = txt_img.height
                center_y = (target_ymin + target_ymax) / 2
                paste_y = center_y - (txt_h / 2)
                paste_y = max(paste_y, image_offset_y)
                # Prevent overlap with previous text
                if paste_y < last_text_bottom:
                    paste_y = last_text_bottom + 5 # Move down + padding
                # Grow the canvas if the text would overflow at the bottom
                required_height = int(paste_y + txt_h + 20)  # +20 for bottom padding
                if required_height > final_img.height:
                    new_final = Image.new("RGB", (final_img.width, required_height), "white")
                    new_final.paste(final_img, (0, 0))
                    final_img = new_final
                    # The draw object is bound to the old image: recreate it
                    draw = ImageDraw.Draw(final_img, "RGBA")
                # Paste near the left edge, using the text's own alpha as mask
                final_img.paste(txt_img, (10, int(paste_y)), mask=txt_img)
                last_text_bottom = paste_y + txt_h
            # Save Image
            save_path = os.path.join(output_dir, f"{label}.jpg")
            final_img.save(save_path)
        json_path = os.path.join(output_dir, "score.json")
        with open(json_path, "w") as f:
            json.dump(d_notes, f, indent=4)
        concat_display_image(output_dir)
 from pathlib import Path
 import subprocess
 def concat_display_image(subdir):
    """Stack every ``*.jpg`` of ``subdir`` (except ``Concat.jpg``) vertically.

    Images are taken in sorted path order and pasted at x=0 on an RGB canvas
    as wide as the widest image; the result is saved as ``Concat.jpg`` in the
    same directory. Does nothing when there is no image.
    """
    folder = Path(subdir)
    # Ignore the output of a previous concatenation
    sources = sorted(p for p in folder.glob("*.jpg") if p.name != "Concat.jpg")
    if not sources:
        return
    tiles = [Image.open(p) for p in sources]
    canvas_w = max(tile.width for tile in tiles)
    canvas_h = sum(tile.height for tile in tiles)
    canvas = Image.new('RGB', (canvas_w, canvas_h))
    top = 0
    for tile in tiles:
        canvas.paste(tile, (0, top))
        top += tile.height
    save_path = folder / "Concat.jpg"
    canvas.save(save_path)
    print(f"Saved: {save_path}")
    # subprocess.call(('xdg-open', save_path))
    # subprocess.call(('xdg-open', save_path))
 def concat_anot_images(directory):
    """Build ``Concat.jpg`` in every ``Anot*`` sub-directory of ``directory``.

    In each matching sub-directory, all ``*.jpg`` except ``Concat.jpg`` are
    stacked vertically in sorted path order, saved as ``Concat.jpg`` and
    opened with ``xdg-open``. Sub-directories without images are skipped.
    """
    for folder in Path(directory).iterdir():
        if not (folder.is_dir() and folder.name.startswith("Anot")):
            continue
        # Ignore the output of a previous concatenation
        sources = sorted(p for p in folder.glob("*.jpg") if p.name != "Concat.jpg")
        if not sources:
            continue
        tiles = [Image.open(p) for p in sources]
        canvas_w = max(tile.width for tile in tiles)
        canvas_h = sum(tile.height for tile in tiles)
        canvas = Image.new('RGB', (canvas_w, canvas_h))
        top = 0
        for tile in tiles:
            canvas.paste(tile, (0, top))
            top += tile.height
        save_path = folder / "Concat.jpg"
        canvas.save(save_path)
        print(f"Saved: {save_path}")
        subprocess.call(('xdg-open', save_path))
 # --- Script entry: annotate every copy found under the given directory ---
 if len(sys.argv) <= 1:
    print("Usage: python script.py <Dir>")
    sys.exit(1)
 root_dir = sys.argv[1]
 # One label per line in <Dir>/labels; empty lines are ignored
 labels = [line for line in (Path(root_dir) / "labels").read_text().splitlines() if line]
 # results: Copie id -> label -> {pdf_path, result, coordinates}
 results = make_dictionary(root_dir)
 process_correction(root_dir, results, labels)
 # concat_anot_images(root_dir)
--- a/correction.py
+++ b/correction.py
@ -0,0 +1,291 @@
 import sys
 import os
 import time
 from pathlib import Path
 import argparse
 # --- Command line handling and task discovery ---
 # The first positional argument is either a single group image
 # (<InputDir>/<label>/<file>.jpg) or the input directory itself.
 if len(sys.argv) < 2:
    sys.exit("Usage: python script.py InterroTest/Ex 2/Group_1.jpg OR <InputDir>")
 # NOTE(review): sys.argv[1] is always taken as the path, so --overwrite has
 # to be given after it.
 arg_path = Path(sys.argv[1])
 tasks = [] # List of tuples: (filepath_str, label_str)
 results = {}  # label -> list of parsed Gemini answers
 # Parse Arguments
 parser = argparse.ArgumentParser()
 parser.add_argument("--overwrite", action="store_true", help="Force redo requests even if output exists")
 # parse_known_args is used to avoid conflicts if run inside an environment passing other flags
 # (it also ignores the positional path, which is not declared on the parser)
 args, _ = parser.parse_known_args()
 if arg_path.suffix == ".jpg":
    # Single image: the label is the name of the parent directory and the
    # input directory is two levels above the file.
    INPUT_DIR = str(arg_path.parents[1])
    FULL_LABEL = arg_path.parent.name
    tasks.append((str(arg_path), FULL_LABEL))
    results[FULL_LABEL] = []
 else:
    # Directory: one task per *.jpg found in each sub-directory whose name
    # starts with "Ex"; the sub-directory name is the label.
    INPUT_DIR = str(arg_path)
    if not arg_path.exists():
        sys.exit(f"Directory {INPUT_DIR} not found.")
    for sub in arg_path.iterdir():
        if sub.is_dir() and sub.name.startswith("Ex"):
            label = sub.name
            results[label] = []
            for img in sub.glob("*.jpg"):
                tasks.append((str(img), label))
 # Prompt template sent with every group image. The placeholders <<text>>,
 # <<corr>>, <<persp>> and <<label>> are substituted by make_prompt().
 # The JSON example must use the same keys as the instructions ("box_2d").
 my_prompt = """I'm giving you an image of several written answers to an exam.
 Each answer is separated by a black horizontal line, and underneath,
 to the left, is indicated the ID of the answer, from `01` to `50`.
 I want you to score each answer, from 0 to 4, you may score half
 points, such as 2.5. Even if a result is wrong, if the reasoning is
 correct and could lead to a right answer, you should give at least
 half the points.
 You also need to give feedback to the student, in french :
 - which part of his answer is wrong,
 - why is it wrong
 - possibly, what he should have done instead.
 Your feedback may contain LaTeX fragments written like `$a^2 + b^2 = c^2$`.
 If your score is not 4, you should always provide some feedback
 explaining what's missing.
 For each piece of feedback, if it is related to a specific part of the
 answer that is wrong, you may provide a `box_2d`, to locate this
 specific part of the answer. This `box_2d` should be in the form
 [ymin, xmin, ymax, xmax] normalized to 0-1000. If you do not provide
 one, set `box_2d` to `null`.
 If the answer is correct, there is no need to provide feedback.
 For example, if the student says a function is continuous when it
 isn't, provide the coordinates where the word «continuous» is. If a
 calculation went wrong, gives the coordinates of the step where it
 goes wrong, and as feedback, what went wrong.
 You should also give me a measure of confidence, from 0 to 1 that you
 were able to correctly understand the answer. A score below 0.5 means
 that you think it is likely that you couldn't understand an important
 part.
 In some case, you may find that either
 - The student didn't answer the right question. Set the score to 0.
   Since it could be a labeling error, indicate this by setting `error`
   to \"wrong-label\".
 - You can find an answer to another question of the exercice (taking
   more than a couple of lines). Score the question you are supposed
   to score, but set `error` to \"additional-answer\".
 If there's no error, set `error` to `\"\"`.
 You will answer using json describing a list of dictionary with a key
 \"id\", and a key \"result\" that contains the \"score\", the \"confidence\", a
 list \"feedback\", and possibly an \"error\". Like this example :
 [{ \"id\": \"01\",
   \"result\": {\"score\" : 2.5,
              \"confidence\" : 0.8,
              \"feedback\": [{\"text\": \"Un retour générique. Il faut apprendre le cours.\", \"box_2d\": null},
              {\"text\": \"Non, la fonction n'est pas forcément continue\", \"box_2d\": [145, 280, 340, 500]}],
               \"error\": \"\"}
 },
 { \"id\": \"04\",
   \"result\": {\"score\" : 4.,
              \"confidence\" : 0.9,
              \"feedback\" : [],
              \"error\": \"\" }
 }
 ]
 Here is the text of the exercice of the exam :
 ```
 <<text>>
 ```
 Here is a possible correct answer :
 ```
 <<corr>>
 ```
 Here is some additional scoring instructions :
 ```
 <<persp>>
 ```
 You are asked to score the question or exercice labeled `<<label>>`,
 do not score or give feedback to any other question."""
 def make_prompt(full_label):
    """Fill the prompt template for the question ``full_label``.

    The exercise label is made of the first two space-separated words of
    ``full_label`` (e.g. ``"Ex 2 : 1)a)"`` -> ``"Ex 2"``). The files
    ``Text/<ex>``, ``Sol/<ex>`` and ``Persp/<ex>`` under ``INPUT_DIR``
    provide the exercise text, a correct answer and the additional scoring
    instructions. Empty (or whitespace-only) instructions are replaced by a
    sentence saying there are none.

    Returns:
        The prompt string with all four placeholders substituted.

    Raises:
        OSError: if one of the three files cannot be read.
    """
    # Slicing (instead of indexing [1]) also accepts a one-word label
    ex_label = " ".join(full_label.split(" ")[:2])
    base_dir = Path(INPUT_DIR)
    text = (base_dir / "Text" / ex_label).read_text()
    corr = (base_dir / "Sol" / ex_label).read_text()
    persp = (base_dir / "Persp" / ex_label).read_text()
    if not persp.strip():
        persp = "There is no additional scoring instructions."
    return (my_prompt
            .replace("<<text>>", text)
            .replace("<<corr>>", corr)
            .replace("<<persp>>", persp)
            .replace("<<label>>", full_label))
 from google import genai
 from google.genai import types
 import base64
 import json
 from pathlib import Path
 import os
 import threading
 import concurrent.futures
 # --- Gemini / network configuration ---
 # Set PROXY_URL (e.g. "http://192.168.241.1:3128") to route requests
 # through an HTTP(S) proxy.
 PROXY_URL = None
 if PROXY_URL:
    os.environ["http_proxy"] = PROXY_URL
    os.environ["https_proxy"] = PROXY_URL
 MODEL_ID = "gemini-3-pro-preview"
 # Take the key from the environment so that it does not have to be stored in
 # the source file; the previous literal is kept as a fallback.
 api_key = os.environ.get("GEMINI_API_KEY", "REMOVED_API_KEY")
 from pydantic import BaseModel, Field, TypeAdapter
 from typing import List, Optional, Tuple
 # One piece of feedback on an answer. `box_2d` is optional and defaults to
 # None (feedback not tied to a location).
 # NOTE: documented with comments rather than a docstring, since the model is
 # turned into the JSON schema passed to the API in generate_request().
 class FeedbackItem(BaseModel):
    text: str = Field(description="Feedback content")
    box_2d: Optional[List[int]] = Field(None, description="box coordinates or null")
 # Grading result of one answer: score, confidence, feedback list and error
 # tag. All four fields are required (none has a default).
 class ResultData(BaseModel):
    score: float = Field(description="The numeric score")
    confidence: float = Field(description="Confidence level")
    feedback: List[FeedbackItem] = Field(description="List of feedback items")
    error: str = Field(description="Indicates if an error occurred")
 # One element of the response list: an answer id and its grading result.
 class EvaluationEntry(BaseModel):
    id: str = Field(description="Entry identifier")
    result: ResultData = Field(description="Result details")
 # The root model used for parsing is: List[EvaluationEntry]
 def generate_request(file, full_label):
    """Build the ``(contents, config)`` pair of one Gemini request.

    ``contents`` holds a single user turn made of the JPEG bytes of ``file``
    followed by the prompt for ``full_label``; ``config`` asks for a JSON
    answer following the ``List[EvaluationEntry]`` schema.
    """
    prompt = make_prompt(full_label)
    image_bytes = Path(file).read_bytes()
    user_turn = types.Content(
        role="user",
        parts=[
            types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg"),
            types.Part.from_text(text=prompt),
        ],
    )
    answer_schema = TypeAdapter(List[EvaluationEntry]).json_schema()
    # No thinking_config here: unsure whether it is compatible with a
    # JSON response schema.
    request_config = types.GenerateContentConfig(
        temperature=1.0,
        top_p=0.95,
        seed=0,
        max_output_tokens=65535,
        response_mime_type="application/json",
        response_json_schema=answer_schema,
    )
    return ([user_turn], request_config)
 # --- Shared state used by the worker threads ---
 client = genai.Client(api_key=api_key)
 # All answers are written to <INPUT_DIR>/correction.json; the list of
 # finished (file, label) tasks is read from correction_progress.json.
 output_path = Path(INPUT_DIR) / "correction.json"
 progress_path = Path(INPUT_DIR) / "correction_progress.json"
 start_time = time.time()
 overwrite = args.overwrite
 completed_tasks = []
 # --- Lock for thread-safe file writing ---
 io_lock = threading.Lock()
 if overwrite:
    # Start from scratch: drop previous results and progress
    if output_path.exists():
        output_path.unlink()
    if progress_path.exists():
        progress_path.unlink()
 else:
    # Resume: reload the finished tasks, if a progress file exists
    if progress_path.exists():
        with open(progress_path, "r", encoding="utf-8") as f:
            completed_tasks = json.load(f)
    # Reload existing results to avoid overwriting them with partial data
    # (this replaces the `results` dict initialised during task discovery)
    if output_path.exists():
        with open(output_path, "r", encoding="utf-8") as f:
            results = json.load(f)
 # Create a set for O(1) lookup. Normalize paths to strings.
 completed_set = set((str(f), l) for f, l in completed_tasks)
 # Filter tasks first to avoid overhead in threads
 tasks_to_process = [t for t in tasks if (str(t[0]), t[1]) not in completed_set]
 def process_single_task(task_tuple):
    """Send one group image to Gemini and store the parsed answer.

    Args:
        task_tuple: ``(file_path, label)``.

    On success the parsed JSON is appended to ``results[label]``,
    ``correction.json`` is rewritten, and the task is recorded in the
    progress file so that a later run without ``--overwrite`` skips it
    (instead of redoing it and appending a duplicate answer). Errors are
    reported on stderr and never propagate to the executor.
    """
    file_path, label = task_tuple
    try:
        contents, config = generate_request(file_path, label)
        print(f"Asking Gemini: {label} {file_path}")
        # Assuming client is thread-safe (usually is).
        # If not, create a new client instance inside this function.
        stream = client.models.generate_content_stream(
            model=MODEL_ID,
            contents=contents,
            config=config,
        )
        # Chunks without text are skipped
        full_response_text = "".join(chunk.text for chunk in stream if chunk.text)
        # Parse JSON
        json_data = json.loads(full_response_text)
        print(f"Gemini answered correctly for {file_path}")
        # --- CRITICAL: Use Lock for writing shared data ---
        with io_lock:
            if label not in results:
                results[label] = [] # Ensure key exists if not using defaultdict
            results[label].append(json_data)
            # Save Results
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(results, f, indent=2)
            # Save Progress: written under the same lock as the results so
            # that both files stay consistent with each other.
            completed_tasks.append((str(file_path), label))
            with open(progress_path, "w", encoding="utf-8") as f:
                json.dump(completed_tasks, f)
    except json.JSONDecodeError:
        print(f"Error decoding JSON for {file_path}", file=sys.stderr)
    except Exception as e:
        print(f"Exception processing {file_path}: {e}", file=sys.stderr)
 # --- Run all pending tasks on a pool of 6 worker threads ---
 print(f"Starting processing on {len(tasks_to_process)} tasks with 6 threads...")
 # Leaving the `with` block waits for every submitted task to finish.
 # The iterator returned by map() is not consumed: process_single_task
 # handles its own errors and stores its results itself.
 with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
    executor.map(process_single_task, tasks_to_process)
 end_time = time.time()
 print("Time elapsed : ", end_time - start_time,"\n\n\n\n\n")
--- a/cutleft.py
+++ b/cutleft.py
@ -0,0 +1,239 @@
 import sys
 import os
 import time
 import tkinter as tk
 from threading import Thread
 from queue import Queue, Empty
 from pdf2image import convert_from_path
 from PIL import Image, ImageTk
 # --- Configuration ---
 DELIMITER_WIDTH = 5           # width (px) of the bar drawn between two page strips
 DELIMITER_COLOR = (0, 0, 0)   # RGB colour of that bar
 OUTPUT_SIZE = (1000, 1000)    # size every combined image is resized to
 # --- Input selection: a single PDF file or a directory of PDF files ---
 if len(sys.argv) < 2:
    sys.exit("Usage: python script.py <directory_path_or_file_path>")
 path_arg = sys.argv[1]
 files = []       # PDF file names, relative to INPUT_DIR
 INPUT_DIR = ""
 if os.path.isfile(path_arg) and path_arg.lower().endswith('.pdf'):
    INPUT_DIR = os.path.dirname(path_arg)
    files = [os.path.basename(path_arg)]
 elif os.path.isdir(path_arg):
    INPUT_DIR = path_arg
    files = sorted([f for f in os.listdir(INPUT_DIR) if f.lower().endswith('.pdf')])
 else:
    sys.exit("Error: Input must be a directory or a PDF file.")
 # Output images go to <INPUT_DIR>/Cutleft, created if needed
 OUTPUT_DIR = os.path.join(INPUT_DIR, 'Cutleft')
 if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)
 # --- Processing Logic ---
 def process_single_pdf(filename, shift_offset=0):
    """
    Build the review image (PIL object) of one PDF of INPUT_DIR.

    From every page, a vertical strip starting at x = 100 + shift_offset and
    one third of the page wide is cropped (clamped to the page). The strips
    are laid side by side, separated by a delimiter bar, and the result is
    resized to OUTPUT_SIZE.

    Returns None when no strip could be cropped or when any error occurs
    (the error is printed).
    """
    pdf_path = os.path.join(INPUT_DIR, filename)
    try:
        strips = []
        for page in convert_from_path(pdf_path):
            page_w, page_h = page.size
            # Clamp the crop window to the page
            x0 = max(0, 100 + shift_offset)
            x1 = min(page_w, (page_w // 3) + 100 + shift_offset)
            if x1 <= x0:
                continue
            strips.append(page.crop((x0, 0, x1, page_h)))
        if not strips:
            return None
        # Lay the strips out horizontally with a bar between neighbours
        count = len(strips)
        sheet_w = sum(strip.width for strip in strips) + (count - 1) * DELIMITER_WIDTH
        sheet_h = max(strip.height for strip in strips)
        sheet = Image.new('RGB', (sheet_w, sheet_h), color=(255, 255, 255))
        cursor = 0
        for position, strip in enumerate(strips, start=1):
            sheet.paste(strip, (cursor, 0))
            cursor += strip.width
            if position < count:
                bar = Image.new('RGB', (DELIMITER_WIDTH, sheet_h), color=DELIMITER_COLOR)
                sheet.paste(bar, (cursor, 0))
                cursor += DELIMITER_WIDTH
        return sheet.resize(OUTPUT_SIZE, Image.LANCZOS)
    except Exception as e:
        print(f"Error processing {filename}: {e}")
        return None
 def save_image(pil_img, filename):
    """Save pil_img in OUTPUT_DIR as a JPEG (quality 95) named after filename."""
    stem, _ext = os.path.splitext(filename)
    output_filename = f"{stem}.jpg"
    pil_img.save(os.path.join(OUTPUT_DIR, output_filename), "JPEG", quality=95)
    print(f"Saved: {output_filename}")
 # --- GUI Application ---
 class ImageReviewer:
    """Tk window to review the cropped image of each PDF, one file at a time.

    Keys: Enter goes to the next file; ``n`` / ``N`` / ``t`` shift the crop
    window by +50 / +100 / -50 px and re-process the current file. Every
    image that gets displayed is also saved with ``save_image``. A daemon
    thread pre-computes the image of the next file (with shift 0).

    The constructor builds the window and runs the Tk main loop; it returns
    when the window is closed.
    """
    def __init__(self, file_list):
        self.files = file_list
        self.index = 0
        self.current_shift = 0
        self.current_pil = None
        self.is_processing = False
        # Queue for pre-fetched images (index, image)
        self.prefetch_queue = Queue(maxsize=1)
        # Queue for manual re-processing results
        self.manual_queue = Queue()
        # Setup GUI
        self.root = tk.Tk()
        self.root.title("PDF Cropper")
        self.root.geometry("+100+100")
        self.label_img = tk.Label(self.root)
        self.label_img.pack()
        self.label_info = tk.Label(self.root, text="", font=("Arial", 12, "bold"))
        self.label_info.pack(pady=5)
        # Bindings
        self.root.bind('<Return>', self.on_next)
        self.root.bind('n', lambda e: self.on_shift(50))
        self.root.bind('N', lambda e: self.on_shift(100))
        self.root.bind('t', lambda e: self.on_shift(-50))
        # Start background pre-fetcher
        self.bg_thread = Thread(target=self.prefetch_worker, daemon=True)
        self.bg_thread.start()
        # Load first image
        self.load_current_image()
        self.root.lift()
        self.root.focus_force()
        self.root.mainloop()
    def prefetch_worker(self):
        """Background thread: keep the image of the NEXT file ready."""
        idx_to_process = 0
        while True:
            target = self.index + 1
            if target < len(self.files):
                if idx_to_process != target:
                    fname = self.files[target]
                    img = process_single_pdf(fname, shift_offset=0)
                    if img:
                        self.prefetch_queue.put((target, img)) # Blocks if full
                    # Mark the target as handled even on failure, otherwise
                    # a broken PDF would be re-processed in a tight loop.
                    idx_to_process = target
            # Sleep briefly to release CPU
            time.sleep(0.1)
    def load_current_image(self, use_prefetch=False):
        """Display (and save) the file at ``self.index``.

        Closes the window when every file has been handled. With
        ``use_prefetch`` the pre-computed image is used when available;
        otherwise the file is processed in a worker thread.
        """
        if self.index >= len(self.files):
            print("All files processed.")
            self.root.destroy()
            return
        filename = self.files[self.index]
        self.is_processing = False
        img_found = None
        if use_prefetch:
            # Pop entries until the one for the current index is found.
            # Entries for another index are discarded: left in the queue
            # (maxsize=1) they would block the prefetch worker forever.
            while img_found is None:
                try:
                    q_idx, q_img = self.prefetch_queue.get_nowait()
                except Empty:
                    break
                if q_idx == self.index:
                    img_found = q_img
                    self.current_shift = 0
                    print(f"Loaded {filename} from prefetch.")
        if img_found:
            self.current_pil = img_found
            save_image(self.current_pil, filename)
            self.update_display(filename)
        else:
            # Not in queue (first load or queue mismatch), process manually
            self.trigger_processing(filename, self.current_shift)
    def trigger_processing(self, filename, shift):
        """Starts a thread to process image so GUI doesn't freeze."""
        self.is_processing = True
        self.label_info.configure(text=f"Processing {filename} (Shift {shift})... Please wait.", fg="red")
        def worker():
            img = process_single_pdf(filename, shift)
            self.manual_queue.put(img)
        Thread(target=worker, daemon=True).start()
        self.check_manual_queue(filename)
    def check_manual_queue(self, filename):
        """Polls the manual queue for result."""
        try:
            img = self.manual_queue.get_nowait()
        except Empty:
            # Check again in 100ms
            self.root.after(100, lambda: self.check_manual_queue(filename))
            return
        # Processing of this file is over. Reset the flag BEFORE possibly
        # chaining to the next file, which may set it again.
        self.is_processing = False
        self.current_pil = img
        if self.current_pil:
            save_image(self.current_pil, filename)
            self.update_display(filename)
        else:
            print(f"Failed to process {filename}, skipping.")
            self.index += 1
            self.load_current_image(use_prefetch=True)
    def update_display(self, filename):
        """Show ``self.current_pil`` and refresh the info line."""
        if self.current_pil:
            tk_image = ImageTk.PhotoImage(self.current_pil)
            self.label_img.configure(image=tk_image)
            # Keep a reference on the widget so the image is not collected
            self.label_img.image = tk_image
            self.label_info.configure(
                text=f"[{self.index+1}/{len(self.files)}] {filename} | Shift: {self.current_shift}px\n"
                     f"Enter: Next | n: +50 | N: +100 | t: -50",
                fg="black"
            )
    def on_shift(self, amount):
        """Shift the crop window by ``amount`` px and re-process the file."""
        if self.is_processing:
            return # Ignore keys while processing
        self.current_shift += amount
        print(f"Applying shift: {self.current_shift}")
        self.trigger_processing(self.files[self.index], self.current_shift)
    def on_next(self, event):
        """Go to the next file (ignored while processing)."""
        if self.is_processing:
            return
        self.index += 1
        self.current_shift = 0
        self.load_current_image(use_prefetch=True)
 # --- Entry Point ---
 if __name__ == "__main__":
    # The reviewer window is only opened when there is something to review
    if files:
        app = ImageReviewer(files)
    else:
        print("No PDF files found.")
--- a/enonce_info.py
+++ b/enonce_info.py
@ -0,0 +1,119 @@
 import sys
 import os
 import glob
 import json
 import urllib.request
 import re
 def replace_dots(text):
    """Rewrite a line-leading ``X.`` marker as ``X)``.

    Only a single character (after optional leading whitespace) may precede
    the dot, so ``1.`` and ``a.`` are rewritten while ``10.`` is left alone.
    """
    # (?m) switches on multiline mode so that ^ anchors at every line start.
    marker = re.compile(r"(?m)^(\s*.)\.")
    return marker.sub(r"\1)", text)
 def format_indices(indices):
    """Render a question index path as a label suffix, e.g. ``[2, 1]`` -> ``'2)a)'``.

    The first index is printed as given, followed by ``)``. A second index,
    when present, is mapped to a lowercase letter (1 -> a). Any further
    levels are ignored, and an empty or missing list yields ``''``.
    """
    if not indices:
        return ""
    pieces = [f"{indices[0]})"]
    if len(indices) > 1:
        # chr(97) is 'a', so index 1 maps to 'a', 2 to 'b', ...
        pieces.append(f"{chr(96 + indices[1])})")
    return "".join(pieces)
 def process_directory(directory):
    """Build the labels file and the per-exercise text files for one directory.

    The first ``*.tex`` file returned by glob is scanned for lines starting
    with ``%%SHEETINFO :``; the remainder of such a line is parsed as JSON.
    For each of them:

    * one line per entry of ``indexes`` (or a single ``Ex N`` line when there
      is none) is written to ``<directory>/labels``;
    * the exercise is fetched from the local HTTP service and the response,
      split on ``###``, is stored as ``Text/Ex N``, ``Sol/Ex N`` and
      ``Persp/Ex N`` after passing through ``replace_dots``.

    A line that fails is reported on stdout and skipped; the exercise number
    only advances after a fully successful line.

    Args:
        directory: path of the directory holding the ``.tex`` file.

    Returns:
        None. Returns early when the directory has no ``.tex`` file.
    """
    # Find the first .tex file in the directory
    tex_files = glob.glob(os.path.join(directory, "*.tex"))
    if not tex_files:
        print(f"No .tex file found in {directory}")
        return
    tex_file = tex_files[0]
    # Prepare output directories
    paths = {
        'Text': os.path.join(directory, "Text"),
        'Sol': os.path.join(directory, "Sol"),
        'Persp': os.path.join(directory, "Persp")
    }
    for p in paths.values():
        os.makedirs(p, exist_ok=True)
    labels_file = os.path.join(directory, "labels")
    current_ex_num = 1
    with open(tex_file, 'r', encoding='utf-8') as f_in, \
         open(labels_file, 'w', encoding='utf-8') as f_labels:
        for line in f_in:
            if not line.startswith("%%SHEETINFO :"):
                continue
            # Reset for every sheet line: the generic handler below formats
            # `ids`, which would otherwise be unbound (or left over from the
            # previous sheet) when the failure happens before it is assigned.
            ids = "<unknown>"
            try:
                json_str = line.split(":", 1)[1].strip()
                data = json.loads(json_str)
                # Labels: one line per indexed question, or the bare exercise.
                indexes = data.get('indexes', [])
                if not indexes:
                    f_labels.write(f"Ex {current_ex_num}\n")
                else:
                    for item in indexes:
                        suffix = format_indices(item['indices'])
                        if suffix != "":
                            f_labels.write(f"Ex {current_ex_num} : {suffix}\n")
                        else:
                            f_labels.write(f"Ex {current_ex_num}\n")
                # Construct 'ids' parameter
                ex_id = str(data['id'])
                selection = data.get('select')
                if selection is not None:
                    # Format: "ID.sel1,sel2"; the stored selection is shifted by one.
                    sel_s = [i+1 for i in selection]
                    ids = f"{ex_id}.{','.join(map(str, sel_s))}"
                else:
                    ids = ex_id
                # Construct URL
                url = f"http://localhost:8080/exercices/emacs/{ids}?pretty=true&all=true&persp=true"
                # Perform GET request
                with urllib.request.urlopen(url) as response:
                    content = response.read().decode('utf-8')
                # Split the response into statement / solution / perspective.
                parts = content.split('###')
                # Ensure we have at least 3 parts, pad if necessary to avoid crashes
                while len(parts) < 3:
                    parts.append("")
                base_filename = f"Ex {current_ex_num}"
                for key, part in zip(('Text', 'Sol', 'Persp'), parts):
                    with open(os.path.join(paths[key], base_filename), 'w', encoding='utf-8') as f:
                        f.write(replace_dots(part.strip("\n")))
                current_ex_num += 1
            except json.JSONDecodeError:
                print(f"Error decoding JSON in line: {line.strip()}")
            except Exception as e:
                print(f"Error processing {ids}: {e}")
 if __name__ == "__main__":
    # The directory to process is the first (and only required) argument.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python script.py <Dir>")
        sys.exit(1)
    process_directory(cli_args[0])
--- a/gemini-batch.py
+++ b/gemini-batch.py
@ -0,0 +1,166 @@
 import sys
 import os
 import time
 from google import genai
 from google.genai import types
 import base64
 from pathlib import Path
 # Command line: the single argument is the working directory; the images are
 # looked up in its `Cutleft` sub-directory.
 if len(sys.argv) < 2:
    sys.exit("Usage: python script.py <directory_path>")
 INPUT_DIR = sys.argv[1]
 CUTLEFT_DIR = os.path.join(INPUT_DIR, 'Cutleft')
 MODEL_ID = "gemini-3-flash-preview"
 # NOTE(review): this literal looks like a scrubbed placeholder, not a usable
 # key; confirm how the real key is meant to be supplied.
 api_key="REMOVED_API_KEY"
 # Prompt sent along with each image. `##labels##` and `##names##` are
 # placeholders for the expected labels and the list of student names; nothing
 # in this block substitutes them.
 my_prompt = """I'm giving you an image of the left columns of a written exam.
 Students answer several exercises, which can have several questions.
 The image consists of several columns, separated by vertical black
 lines. The image should be read top to bottom and then left to right,
 meaning first column, then second column, etc.
 In their sheet, students delimit exercises and questions using
 delimiters such as `Ex 1`, or `Exercice 1`, and `1)` or `a)`. You need
 to give me the bounding boxes of each delimiter.
 When giving the bounding box of the first question of an exercise, the
 box should be large enough to contain both the exercice label
 (`Exercice i`) and the question label (`1)`) parts.
 You also need to give me the student name. It should appear on the top
 left of the image. Disregard any mention of `MPSI 3`, it is their
 class. A list of possible student names will be given below.
 You will answer with a JSON object, containing a `name` field with the
 name, and a `list` field, with the list of the bounding boxes and
 their labels. The box_2d should be [ymin, xmin, ymax, xmax] normalized
 to 0-1000.
 Here is an example :
 {\"name\" : \"John Doe\", \"list\" : [{\"box_2d\": (10, 20, 30, 40), \"label\" : \"Ex 1 : 1)\"}]}
 Do not provide a box_2d for the name. Only for the labels.
 You may find the same label present several times, as a student either
 recall the current label on a new page, or adds content to its answer
 later on. Give the position of each instance of each label.
 For this exam you should look for the labels given below, separated by
 newlines. A student need not have answered every question, so some may
 be missing.
 ##labels##
 Here's a list of the names of the students, pick the one that matches
 the best or `\"Unknown\"` if you cannot read the name
 ##names##"""
 from tqdm import tqdm
 def process_batch(directory):
    """Send every ``*.jpg`` of ``directory`` through one Gemini batch job.

    Steps: upload each image, write one request per image to
    ``<INPUT_DIR>/batch_input.jsonl``, upload that file, create the batch job,
    poll it every 10 seconds, then write each returned response to
    ``<directory>/<image name>.json``.

    Args:
        directory: folder that contains the ``.jpg`` images; result files are
            written next to them.

    Returns:
        None. Returns early when there is no image or when the job ends in
        the ``FAILED`` state.
    """
    # The JSONL file and the results are (de)serialized with json, which this
    # file does not import at module level; import it here so the function
    # does not fail with NameError.
    import json

    client = genai.Client(api_key=api_key)
    image_files = list(Path(directory).glob("*.jpg"))
    if not image_files:
        print("No .jpg files found.")
        return
    # 1. Upload images to File API (Batch requirement)
    batch_requests = []
    print(f"Uploading {len(image_files)} images to File API...")
    for img_path in tqdm(image_files, unit="img"):
        # Upload file
        file_ref = client.files.upload(path=img_path)
        # Construct Request for JSONL
        # Note: We must serialize config manually for the JSONL body
        # NOTE(review): my_prompt is sent as-is, i.e. with its ##labels## and
        # ##names## placeholders still in place - confirm this is intended.
        req_body = {
            "contents": [
                {"role": "user", "parts": [
                    {"fileData": {"mimeType": file_ref.mime_type, "fileUri": file_ref.uri}},
                    {"text": my_prompt}
                ]}
            ],
            "generationConfig": {
                "temperature": 1.0,
                "topP": 0.95,
                "maxOutputTokens": 65535,
                "thinkingConfig": {"thinkingBudget": -1}
            },
            "safetySettings": [
                {"category": cat, "threshold": "BLOCK_NONE"}
                for cat in ["HARM_CATEGORY_HATE_SPEECH", "HARM_CATEGORY_DANGEROUS_CONTENT",
                            "HARM_CATEGORY_SEXUALLY_EXPLICIT", "HARM_CATEGORY_HARASSMENT"]
            ]
        }
        # Batch Request Entry; custom_id lets the result be mapped back to its image.
        batch_requests.append({
            "custom_id": img_path.name,
            "method": "POST",
            "url": f"/v1beta/models/{MODEL_ID}:generateContent",
            "body": req_body
        })
    # 2. Create and Upload Batch Source File (JSONL)
    batch_file_path = os.path.join(INPUT_DIR, "batch_input.jsonl")
    with open(batch_file_path, "w") as f:
        for req in batch_requests:
            f.write(json.dumps(req) + "\n")
    batch_input_file = client.files.upload(path=batch_file_path)
    # 3. Submit Batch Job
    print("Submitting batch job...")
    job = client.batches.create(
        model=MODEL_ID,
        src=batch_input_file.name
    )
    print(f"Batch Job ID: {job.name}")
    # 4. Poll for Completion
    # NOTE(review): the loop only stops on "SUCCEEDED" or "FAILED"; verify the
    # state values the SDK actually reports (and any other terminal state),
    # otherwise this polls forever.
    pbar = tqdm(desc="Processing Batch", unit="poll")
    while True:
        job = client.batches.get(name=job.name)
        if job.state == "ACTIVE":
            pbar.set_description("Processing")
        elif job.state == "SUCCEEDED" or job.state == "FAILED":
            break
        pbar.update(1)
        time.sleep(10) # Poll every 10 seconds
    pbar.close()
    if job.state == "FAILED":
        print(f"Batch job failed: {job.error}")
        return
    # 5. Retrieve and Save Results
    print("Downloading results...")
    # The output file is a remote URI, we download its content
    output_content = client.files.content(path=job.output_file.name)
    # Parse JSONL output and map back to files
    # Output format: {"custom_id": "...", "response": {...}}
    results_saved = 0
    for line in output_content.decode("utf-8").splitlines():
        if not line:
            continue
        result = json.loads(line)
        filename = result.get("custom_id")
        if filename:
            output_path = Path(directory) / f"{filename}.json"
            with open(output_path, "w", encoding="utf-8") as f:
                # Save the full response part
                json.dump(result.get("response", {}), f, indent=2)
            results_saved += 1
    print(f"Batch complete. Saved {results_saved} result files.")
 # Executed as soon as the module runs: process the images of <INPUT_DIR>/Cutleft.
 process_batch(CUTLEFT_DIR)
--- a/gemini.py
+++ b/gemini.py
@ -0,0 +1,119 @@
 from google import genai
 from google.genai import types
 import base64
 from pathlib import Path
 # Model used for every request issued by this script.
 MODEL_ID = "gemini-3-flash-preview"
 # NOTE(review): this literal looks like a scrubbed placeholder, not a usable
 # key; confirm how the real key is meant to be supplied.
 api_key="REMOVED_API_KEY"
 # Prompt sent along with the image. `##labels##` and `##names##` mark where
 # the expected labels and the list of student names belong.
 my_prompt = """I'm giving you an image of the left columns of a written exam.
 Students answer several exercises, which can have several questions.
 The image consists of several columns, separated by vertical black
 lines. The image should be read top to bottom and then left to right,
 meaning first column, then second column, etc.
 In their sheet, students delimit exercises and questions using
 delimiters such as `Ex 1`, or `Exercice 1`, and `1)` or `a)`. You need
 to give me the bounding boxes of each delimiter.
 When giving the bounding box of the first question of an exercise, the
 box should be large enough to contain both the exercice label
 (`Exercice i`) and the question label (`1)`) parts.
 You also need to give me the student name. It should appear on the top
 left of the image. Disregard any mention of `MPSI 3`, it is their
 class. A list of possible student names will be given below.
 You will answer with a JSON object, containing a `name` field with the
 name, and a `list` field, with the list of the bounding boxes and
 their labels. The box_2d should be [ymin, xmin, ymax, xmax] normalized
 to 0-1000.
 Here is an example :
 {\"name\" : \"John Doe\", \"list\" : [{\"box_2d\": (10, 20, 30, 40), \"label\" : \"Ex 1 : 1)\"}]}
 Do not provide a box_2d for the name. Only for the labels.
 You may find the same label present several times, as a student either
 recall the current label on a new page, or adds content to its answer
 later on. Give the position of each instance of each label.
 For this exam you should look for the labels given below, separated by
 newlines. A student need not have answered every question, so some may
 be missing.
 ##labels##
 Here's a list of the names of the students, pick the one that matches
 the best or `\"Unknown\"` if you cannot read the name
 ##names##"""
 from pydantic import BaseModel, Field
 from typing import List
 # One labelled bounding box of the structured answer.
 # Documented with comments on purpose: a class docstring would be picked up
 # by pydantic and change the JSON schema generated from this model.
 class BoxItem(BaseModel):
    # Box coordinates; the prompt asks for [ymin, xmin, ymax, xmax] normalized to 0-1000.
    box_2d: List[int] = Field(description="Bounding box coordinates (e.g., [ymin, xmin, ymax, xmax])")
    # Label text the box belongs to (the prompt's example is "Ex 1 : 1)").
    label: str = Field(description="The label associated with the specific box")
 # Top-level shape of the structured answer: a name plus the labelled boxes.
 # Field names mirror the JSON keys requested in the prompt (`name`, `list`).
 class AnnotationData(BaseModel):
    # Student name chosen by the model.
    name: str = Field(description="The name identifier")
    # Every labelled bounding box found in the image.
    list: List[BoxItem] = Field(description="List of bounding box items")
 def generate_request(file, labels, names=""):
    """Build the Gemini request for one image.

    The prompt template contains the ``##labels##`` and ``##names##``
    placeholders; both are substituted here so the model never receives the
    raw markers (the labels used to be appended after the template, leaving
    the placeholders in the text).

    Args:
        file: path of the JPEG image to send.
        labels: text listing the labels to look for.
        names: text listing the possible student names; defaults to an empty
            string so existing two-argument calls keep working.

    Returns:
        A ``(contents, generate_content_config)`` tuple for the
        ``generate_content`` / ``generate_content_stream`` calls.
    """
    image_path = Path(file)
    text = my_prompt.replace("##labels##", labels).replace("##names##", names)
    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_bytes(
                    data=image_path.read_bytes(),
                    mime_type="image/jpeg"
                ),
                types.Part.from_text(text=text),
            ],
        )
    ]
    generate_content_config = types.GenerateContentConfig(
        temperature=1.0,
        top_p=0.95,
        seed=0,
        max_output_tokens=65535,
        # The answer is constrained to the AnnotationData JSON schema.
        response_mime_type= "application/json",
        response_json_schema= AnnotationData.model_json_schema(),
        # No thinking_config is set: the original author was unsure whether it
        # is compatible with the JSON response schema.
    )
    return (contents, generate_content_config)
 import sys
 import os
 import time
 # Two arguments are required: the image and the labels. generate_request()
 # needs the labels, so calling it with the image alone raised TypeError.
 if len(sys.argv) < 3:
    sys.exit("Usage: python script.py Staging/cutleft1000.jpg labels")
 INPUT_FILE = sys.argv[1]
 # The labels argument may be the path of a labels file or the label text
 # itself; a path that exists is read, anything else is used verbatim.
 LABELS = sys.argv[2]
 if os.path.isfile(LABELS):
    with open(LABELS, 'r', encoding='utf-8') as labels_file:
        LABELS = labels_file.read()
 contents, config = generate_request(INPUT_FILE, LABELS)
 client = genai.Client(api_key=api_key)
 # Stream the answer and echo it as it arrives.
 for chunk in client.models.generate_content_stream(
    model=MODEL_ID,
    contents=contents,
    config=config,
 ):
    if chunk.text:
        print(chunk.text, end="", flush=True)
--- a/gemini_dir_batching.py
+++ b/gemini_dir_batching.py
@ -0,0 +1,170 @@
 from google import genai
 from google.genai import types
 import base64
 from pathlib import Path
 from pydantic import BaseModel, Field
 from typing import List
 import sys
 import os
 import time
 import json
 import argparse
 # Model used for every request issued by this script.
 MODEL_ID = "gemini-3-flash-preview"
 # NOTE(review): this literal looks like a scrubbed placeholder, not a usable
 # key; confirm how the real key is meant to be supplied.
 api_key="REMOVED_API_KEY"
 # Prompt template. `##labels##` and `##names##` are replaced with the actual
 # labels and student names by generate_request().
 my_prompt = """I'm giving you an image of the left columns of a written exam.
 Students answer several exercises, which can have several questions.
 The image consists of several columns, separated by vertical black
 lines. The image should be read top to bottom and then left to right,
 meaning first column, then second column, etc.
 In their sheet, students delimit exercises and questions using
 delimiters such as `Ex 1`, or `Exercice 1`, and `1)` or `a)`. You need
 to give me the bounding boxes of each delimiter.
 When giving the bounding box of the first question of an exercise, the
 box should be large enough to contain both the exercice label
 (`Exercice i`) and the question label (`1)`) parts.
 You also need to give me the student name. It should appear on the top
 left of the image. Disregard any mention of `MPSI 3`, it is their
 class. A list of possible student names will be given below.
 You will answer with a JSON object, containing a `name` field with the
 name, and a `list` field, with the list of the bounding boxes and
 their labels. The box_2d should be [ymin, xmin, ymax, xmax] normalized
 to 0-1000.
 Here is an example :
 {\"name\" : \"John Doe\", \"list\" : [{\"box_2d\": (10, 20, 30, 40), \"label\" : \"Ex 1 : 1)\"}]}
 Do not provide a box_2d for the name. Only for the labels.
 You may find the same label present several times, as a student either
 recall the current label on a new page, or adds content to its answer
 later on. Give the position of each instance of each label.
 For this exam you should look for the labels given below, separated by
 newlines. A student need not have answered every question, so some may
 be missing.
 ##labels##
 Here's a list of the names of the students, pick the one that matches
 the best or `\"Unknown\"` if you cannot read the name
 ##names##"""
 # One labelled bounding box of the structured answer.
 # Documented with comments on purpose: a class docstring would be picked up
 # by pydantic and change the JSON schema generated from this model.
 class BoxItem(BaseModel):
    # Box coordinates; the prompt asks for [ymin, xmin, ymax, xmax] normalized to 0-1000.
    box_2d: List[int] = Field(description="Bounding box coordinates (e.g., [ymin, xmin, ymax, xmax])")
    # Label text the box belongs to (the prompt's example is "Ex 1 : 1)").
    label: str = Field(description="The label associated with the specific box")
 # Top-level shape of the structured answer; process_image() validates the
 # model's reply against it before saving.
 class AnnotationData(BaseModel):
    # Student name chosen by the model.
    name: str = Field(description="The name identifier")
    # Every labelled bounding box found in the image.
    list: List[BoxItem] = Field(description="List of bounding box items")
 def generate_request(file, labels, names):
    """Build the Gemini request for one image.

    The prompt template has its ``##labels##`` and ``##names##`` placeholders
    replaced by ``labels`` and ``names``; the image bytes are attached as a
    JPEG part in front of the text.

    Returns:
        A ``(contents, generate_content_config)`` tuple.
    """
    image_path = Path(file)
    prompt_text = my_prompt.replace("##labels##", labels)
    prompt_text = prompt_text.replace("##names##", names)
    image_part = types.Part.from_bytes(
        data=image_path.read_bytes(),
        mime_type="image/jpeg"
    )
    text_part = types.Part.from_text(text=prompt_text)
    contents = [types.Content(role="user", parts=[image_part, text_part])]
    # The answer is constrained to JSON matching the AnnotationData schema.
    generate_content_config = types.GenerateContentConfig(
        temperature=1.0,
        top_p=0.95,
        seed=0,
        max_output_tokens=65535,
        response_mime_type="application/json",
        response_json_schema=AnnotationData.model_json_schema(),
    )
    return contents, generate_content_config
 # Command-line interface
 parser = argparse.ArgumentParser(description="Process a directory or specific file using Gemini.")
 parser.add_argument("input_path", help="The input directory or specific file (e.g., Dir/File.pdf)")
 parser.add_argument("--overwrite", action="store_true", help="Regenerate output even if it exists")
 args = parser.parse_args()
 input_arg = Path(args.input_path)
 # Resolve the working directory and the list of images to process.
 if not input_arg.is_file():
    # A directory was given: take every .jpg of its Cutleft folder.
    INPUT_DIR = input_arg
    CUTLEFT_DIR = INPUT_DIR / 'Cutleft'
    image_files = sorted(CUTLEFT_DIR.glob("*.jpg"))
 else:
    # A file such as Dir/Copiedd.pdf was given: only Cutleft/Copiedd.jpg is processed.
    INPUT_DIR = input_arg.parent
    CUTLEFT_DIR = INPUT_DIR / 'Cutleft'
    target_image = CUTLEFT_DIR / f"{input_arg.stem}.jpg"
    if not target_image.exists():
        print(f"Error: Corresponding image {target_image} not found.")
        sys.exit(1)
    image_files = [target_image]
 # Prompt inputs shared by every request.
 labels = (INPUT_DIR / "labels").read_text()
 names = (INPUT_DIR / "names").read_text()
 client = genai.Client(api_key=api_key)
 # Aim for more than 3.0s per request in order to stay under 20 RPM
 TARGET_INTERVAL = 3.5
 from concurrent.futures import ThreadPoolExecutor
 def process_image(image_file):
    """Annotate one image with Gemini and store the result as JSON.

    The result goes to ``<INPUT_DIR>/<image stem>.json``. An existing result
    is kept (and the image skipped) unless ``--overwrite`` was given. Errors
    are printed, not raised. After a processed image the call sleeps so that
    it lasts at least ``TARGET_INTERVAL`` seconds in total.
    """
    started = time.time()
    stem = os.path.splitext(image_file.name)[0]
    output_json = os.path.join(INPUT_DIR, f"{stem}.json")
    already_done = os.path.exists(output_json)
    if already_done and not args.overwrite:
        print(f"Skipping {image_file.name}, output exists.")
        return
    print(f"Processing {image_file.name}...")
    try:
        # Build the request, run it and validate the reply against the schema.
        contents, config = generate_request(image_file, labels, names)
        response = client.models.generate_content(
            model=MODEL_ID,
            contents=contents,
            config=config
        )
        annota = AnnotationData.model_validate_json(response.text)
        with open(output_json, "w", encoding="utf-8") as f:
            json.dump(annota.model_dump(), f, indent=2)
    except Exception as e:
        print(f"Error processing {image_file.name}: {e}")
    # Rate limiting. Note that this paces each calling thread separately; it
    # is not a limit on the total request rate.
    remaining = TARGET_INTERVAL - (time.time() - started)
    time.sleep(max(0, remaining))
 # Fan the images out over six worker threads.
 with ThreadPoolExecutor(max_workers=6) as pool:
    pool.map(process_image, image_files)
--- a/giving_names.py
+++ b/giving_names.py
@ -0,0 +1,48 @@
 import os
 import sys
 import json
 import shutil
 import re
 def main():
    """Move each ``Anot_CopieNN`` folder to ``Copies annotées/<name> (NN)``.

    The working directory comes from the first command-line argument. For
    every ``CopieNN.json`` in it that has a matching ``Anot_CopieNN`` folder,
    the ``name`` field of the JSON (``Unknown`` when absent) is stripped,
    cleaned of characters that are invalid in paths, and used to name the
    destination folder. Failures are printed and the loop carries on.
    """
    if len(sys.argv) < 2:
        print("Usage: python rename_copies.py <directory_path>")
        sys.exit(1)
    work_dir = sys.argv[1]
    target_subdir = os.path.join(work_dir, "Copies annotées")
    # Make sure the destination folder is there.
    os.makedirs(target_subdir, exist_ok=True)
    # "CopieXX.json", capturing XX
    copie_json = re.compile(r"^Copie(\d+)\.json$")
    for filename in os.listdir(work_dir):
        hit = copie_json.match(filename)
        if hit is None:
            continue
        copie_id = hit.group(1)
        source_folder = os.path.join(work_dir, f"Anot_Copie{copie_id}")
        # Only copies that have an annotated folder are handled.
        if not os.path.isdir(source_folder):
            continue
        try:
            with open(os.path.join(work_dir, filename), 'r', encoding='utf-8') as f:
                data = json.load(f)
            name = data.get("name", "Unknown").strip()
            # Drop the characters that are not allowed in paths.
            safe_name = re.sub(r'[<>:"/\\|?*]', '', name)
            dest_path = os.path.join(target_subdir, f"{safe_name} ({copie_id})")
            print(f"Moving '{source_folder}' -> '{dest_path}'")
            shutil.move(source_folder, dest_path)
        except Exception as e:
            print(f"Error processing {filename}: {e}")
 # Run the renaming only when executed as a script, not on import.
 if __name__ == "__main__":
    main()
--- a/grouping.py
+++ b/grouping.py
@ -0,0 +1,248 @@
 import os
 import json
 import re
 import sys
 import shutil
 from collections import defaultdict
 from PIL import Image, ImageDraw, ImageFont
 from pdf2image import convert_from_path, pdfinfo_from_path
 # Configuration
 DPI = 200  # Rendering resolution; good balance for readability and size
 A4_HEIGHT_INCHES = 11.69
 FULL_PAGE_PX = int(A4_HEIGHT_INCHES * DPI)  # height of one A4 page at DPI
 MAX_GROUP_HEIGHT = 2.5 * FULL_PAGE_PX  # height budget of one group: 2.5 pages
 MAX_GROUP_COUNT = 15  # maximum number of files per group
 SEPARATOR_HEIGHT = 20  # black bar drawn between two images, in pixels
 LABEL_HEIGHT = 50  # vertical space reserved above each image for its "ID" text
 MAX_FILE_SIZE_BYTES = 2.5 * 1024 * 1024  # 2.5 MiB
 # def get_pdf_height(path):
 #     """Returns height in pixels at defined DPI without rendering."""
 #     try:
 #         info = pdfinfo_from_path(path)
 #         # info["Page size"] is usually "width height pts"
 #         # 1 pt = 1/72 inch
 #         # We assume single page PDFs as per prompt implication, or take the first page
 #         pts_height = float(info['Page size'].split(' ')[2]) if 'Page size' in info else 0
 #         return int((pts_height / 72.0) * DPI)
 #     except Exception as e:
 #         print(f"Error reading {path}: {e}")
 #         return 0
 def get_pdf_height(path):
    """Estimate the rendered height of a whole PDF, in pixels at ``DPI``.

    The page height reported by pdfinfo (in points, 72 per inch) is converted
    to pixels and multiplied by the page count. Missing fields fall back to
    one page and a height of 0. Any error is printed and 0 is returned.
    """
    try:
        info = pdfinfo_from_path(path)
        page_count = 1
        if "Pages" in info:
            page_count = int(info["Pages"])
        height_pts = 0
        if 'Page size' in info:
            # Third space-separated field of the "Page size" entry.
            height_pts = float(info['Page size'].split(' ')[2])
        # 1 pt = 1/72 inch
        page_px = int((height_pts / 72.0) * DPI)
        return page_px * page_count
    except Exception as e:
        print(f"Error reading {path}: {e}")
        return 0
 def collect_files(root_dir):
    """
    Walk ``root_dir`` looking for PDFs stored as Dir/Copiedd/identifier.pdf.

    Only folders whose name starts with 'Copie' followed by two digits are
    considered. Returns a dict: {identifier: [(dd, path, height), ...]} where
    the identifier is the PDF file name without its extension.
    """
    by_identifier = defaultdict(list)
    copie_dir = re.compile(r'Copie(\d{2})')
    for current_dir, _subdirs, filenames in os.walk(root_dir):
        hit = copie_dir.match(os.path.basename(current_dir))
        if not hit:
            continue
        dd = hit.group(1)
        for filename in filenames:
            if not filename.lower().endswith('.pdf'):
                continue
            identifier = os.path.splitext(filename)[0]
            pdf_path = os.path.join(current_dir, filename)
            pdf_height = get_pdf_height(pdf_path)
            by_identifier[identifier].append((dd, pdf_path, pdf_height))
    return by_identifier
 def group_files(file_list):
    """Split ``(dd, path, height)`` triples into groups, ordered by ``dd``.

    A new group is started whenever the current one already holds
    MAX_GROUP_COUNT files or adding the next file would push its estimated
    height above MAX_GROUP_HEIGHT. Returns the list of groups.
    """
    ordered = sorted(file_list, key=lambda triple: triple[0])
    groups = []
    batch = []
    running_height = 0
    for entry in ordered:
        _dd, _path, height = entry
        # Separator plus approximate text space, charged only when the file
        # is not the first one of its group.
        overhead = SEPARATOR_HEIGHT + 30 if batch else 0
        is_full = len(batch) >= MAX_GROUP_COUNT
        if is_full or running_height + height + overhead > MAX_GROUP_HEIGHT:
            # Close the current group and open a fresh one.
            if batch:
                groups.append(batch)
            batch = []
            running_height = 0
            overhead = 0
        batch.append(entry)
        running_height += height + overhead
    if batch:
        groups.append(batch)
    return groups
 def stitch_pdf_pages(images_list):
    """Stack the given PIL images vertically, without any separator.

    Returns None for an empty list and the image itself when there is only
    one. Otherwise the pages are pasted left-aligned onto a white canvas as
    wide as the widest page and as tall as all pages together.
    """
    if not images_list:
        return None
    if len(images_list) == 1:
        return images_list[0]
    widths = [page.width for page in images_list]
    heights = [page.height for page in images_list]
    combined = Image.new('RGB', (max(widths), sum(heights)), 'white')
    top = 0
    for page, page_height in zip(images_list, heights):
        combined.paste(page, (0, top))
        top += page_height
    return combined
 def create_jpg(identifier, group_index, group, root_dir):
    """Render one group of PDFs into a single stacked JPEG plus a JSON sidecar.

    Each PDF of the group is rendered at DPI (multi-page PDFs are stitched
    into one image) and pasted onto a white canvas, top to bottom. Every
    image is preceded by an "ID: dd" text line, and images are separated by a
    black bar. The files are written to <root_dir>/<identifier>/ as
    Group_<n>.jpg and Group_<n>.json, with n = group_index + 1.

    Args:
        identifier: name of the output sub-folder.
        group_index: zero-based index of the group.
        group: iterable of (dd, path, height) triples; the height is unused here.
        root_dir: directory under which the output folder is created.

    Returns:
        None. Nothing is written when no PDF of the group could be rendered.
    """
    images = []
    metadata = []  # (dd, h_min, h_max) for every pasted image, in canvas pixels
    # Render PDFs to images
    for dd, path, _ in group:
        try:
            # Convert pdf to image
            imgs = convert_from_path(path, dpi=DPI)
            if imgs:
                # Concatenate multi-page PDFs into one single image object
                combined_img = stitch_pdf_pages(imgs)
                if combined_img:
                    images.append((dd, combined_img))
        except Exception as e:
            # A PDF that cannot be rendered is reported and left out of the group.
            print(f"Failed to convert {path}: {e}")
    if not images:
        return
    # Calculate total canvas size
    total_width = max(img.width for _, img in images)
    total_height = sum(img.height for _, img in images) + ((len(images) - 1) * SEPARATOR_HEIGHT)
    # Add space for text (LABEL_HEIGHT pixels per label)
    total_height += len(images) * LABEL_HEIGHT
    canvas = Image.new('RGB', (total_width, total_height), 'white')
    draw = ImageDraw.Draw(canvas)
    # Try loading a font, fallback to default
    try:
        font = ImageFont.truetype("DejaVuSans.ttf", 40)
    except IOError:
        print("font not found")
        font = ImageFont.load_default()
    y_offset = 0
    for i, (dd, img) in enumerate(images):
        # Draw separator if not first image
        if i > 0:
            draw.rectangle([0, y_offset, total_width, y_offset + SEPARATOR_HEIGHT], fill='black')
            y_offset += SEPARATOR_HEIGHT
        # Draw Text (dd)
        text = f"ID: {dd}"
        draw.text((10, y_offset + 5), text, fill='black', font=font)
        y_offset += LABEL_HEIGHT # Space for text
        # Record Image Coordinates (vertical extent of the image on the canvas)
        h_min = y_offset
        h_max = y_offset + img.height
        metadata.append((dd, h_min, h_max))
        # Draw Image, aligned on the left edge
        x_pos = 0
        canvas.paste(img, (x_pos, y_offset))
        y_offset += img.height
    target_folder = os.path.join(root_dir, identifier)
    os.makedirs(target_folder, exist_ok=True)
    # Save JSON metadata
    json_filename = f"Group_{group_index+1}.json"
    json_path = os.path.join(target_folder, json_filename)
    with open(json_path, 'w') as f:
        json.dump(metadata, f)
    # Save with size constraints: lower the JPEG quality in steps of 5 until
    # the file fits MAX_FILE_SIZE_BYTES. The loop stops once quality reaches
    # 10, so the last attempt (quality 15) is kept even if it is still too big.
    output_filename = f"Group_{group_index+1}.jpg"
    output_path = os.path.join(target_folder, output_filename)
    quality = 90
    while quality > 10:
        canvas.save(output_path, "JPEG", quality=quality, optimize=True)
        if os.path.getsize(output_path) <= MAX_FILE_SIZE_BYTES:
            if quality < 90:
                print("quality : ", quality)
            break
        quality -= 5
    print(f"Saved {output_path} ({os.path.getsize(output_path)/1024/1024:.2f} MB)")
 def main():
    """Group the PDFs found under the given directory into stacked JPEGs.

    The root directory is the first command-line argument. For every
    identifier found by ``collect_files`` the output folder
    ``<root>/<identifier>`` is recreated from scratch, then one JPEG (and its
    JSON) is produced per group.
    """
    if len(sys.argv) < 2:
        print("Usage: python app.py <Path_to_Dir>")
        sys.exit(1)
    root_dir = sys.argv[1]
    print("Scanning files...")
    data = collect_files(root_dir)
    print(f"Found {len(data)} identifiers. Processing...")
    for identifier, triples in data.items():
        # Start from an empty output folder for this identifier.
        out_dir = os.path.join(root_dir, identifier)
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        os.makedirs(out_dir, exist_ok=True)
        # triples is a list of (dd, path, height)
        for position, members in enumerate(group_files(triples)):
            create_jpg(identifier, position, members, root_dir)
    print("Done.")
 # Run the grouping only when executed as a script, not on import.
 if __name__ == "__main__":
    main()
--- a/page_splitter.py
+++ b/page_splitter.py
@ -0,0 +1,377 @@
 import fitz  # PyMuPDF
 import tkinter as tk
 from tkinter import messagebox
 from PIL import Image, ImageTk, ImageDraw
 import sys
 import os
 import glob
 import shutil
 from pypdf import PdfReader, PdfWriter
 # --- Constants ---
 # Conversion factor from centimetres to points:
 # 1 inch = 2.54 cm and 1 inch = 72 points, so 1 cm is about 28.35 points.
 CM_TO_POINTS = (1 / 2.54) * 72
 def list_pdf_files(directory):
    """Return the ``*.pdf`` paths directly inside ``directory``, in descending name order."""
    pattern = os.path.join(directory, "*.pdf")
    return sorted(glob.glob(pattern), reverse=True)
 class PDFPreviewer:
    def setup_next_file(self):
        self.num += 1
        if len(self.inputs) == 0:
            return False
        self.pdf_path = self.inputs.pop()
        self.file_rotation = 0
        self.base_name = os.path.splitext(os.path.basename(self.pdf_path))[0]
        self.split_dir = f"{self.base_name}_split"
        self.reorder_dir = f"{self.base_name}_reorder"
        if self.output_dir is None:
            self.final_file = f"{self.base_name}_final"
        else:
            self.final_file = f"{self.output_dir}/Copie{self.num:02}.pdf"
        self.current_page_index = 0
        self.page_settings = []
        self.processing = False  # Flag to prevent multiple finish calls
        try:
            self.doc = fitz.open(self.pdf_path)
        except Exception as e:
            messagebox.showerror("Error", f"Failed to open PDF file: {e}")
            self.master.destroy()
            return
        self.master.title(f"PDF Splitter - {os.path.basename(self.pdf_path)}")
        return True
    def __init__(self, master, path):
        """
        Initializes the application.

        When the path is missing, contains no PDF, or its first PDF cannot be
        opened, an error box is shown, the window is destroyed and the
        instance is left only partially initialised.

        Args:
            master (tk.Tk): The root Tkinter window.
            path (str): Path to a single PDF file, or to a directory whose
                PDF files are processed one after another.
        """
        if not os.path.exists(path):
            messagebox.showerror("Error", f"File not found: {path}")
            master.destroy()
            return
        if os.path.isdir(path):
            # Directory mode: merged results go to a sibling "<path>_out" directory.
            self.inputs = list_pdf_files(path)
            self.output_dir = f"{path}_out"
        else:
            self.inputs = [path]
            self.output_dir = None
        self.master = master
        self.num = 0
        self.global_rotation = 0 # Rotation applied to every file
        if not self.inputs:
            # Without this check the code below would fail on the missing self.doc.
            messagebox.showerror("Error", f"No PDF files found in: {path}")
            master.destroy()
            return
        if not self.setup_next_file():
            # setup_next_file has already reported the error and destroyed the window.
            return
        self._resize_job = None  # For debouncing resize events
        self._initialize_current_page_settings()
        # --- UI Setup ---
        # Set a reasonable initial size for the window
        self.master.geometry("800x1000")
        # NOTE(review): the text says 1cm but the handlers move the line by
        # CM_TO_POINTS / 2 — confirm which one is intended.
        instructions = (
            "← / → : Move line 1cm left/right\n"
            "'c': Rotate page 180°, 'C' : rotate all pages, ',' : rotate all files\n"
            "t s r n: keep left, next page, keep none, keep right\n"
            "z: send this page to the end\n"
        )
        self.info_label = tk.Label(master, text=instructions, justify=tk.LEFT)
        self.info_label.pack(pady=5, side=tk.TOP)
        self.page_label = tk.Label(master, text="", font=("Helvetica", 12))
        self.page_label.pack(pady=5, side=tk.TOP)
        # Canvas for PDF page preview
        self.canvas = tk.Canvas(master, bg="gray")
        self.canvas.pack(fill="both", expand=True)
        # --- Bindings ---
        self.master.bind("<Left>", self.move_line_left)
        self.master.bind("<Right>", self.move_line_right)
        self.master.bind("<Return>", self.confirm_and_next_page)
        self.master.bind("c", self.rotate_page)
        self.master.bind("C", self.rotate_all_pages)
        self.master.bind(",", self.rotate_all_files)
        self.master.bind("t", self.keep_left)
        self.master.bind("n", self.keep_right)
        self.master.bind("s", self.confirm_and_next_page)
        self.master.bind("r", self.discard_page)
        self.master.bind("z", self.send_page_end)
        # Bind the resize event on the canvas
        self.canvas.bind("<Configure>", self.on_resize)
        self.current_zoom = 1.0
    def on_resize(self, event):
        """
        Handles window resize events by reloading the page.
        Uses a "debounce" mechanism to avoid excessive redrawing.
        """
        if self._resize_job:
            self.master.after_cancel(self._resize_job)
        self._resize_job = self.master.after(250, self.load_page) # Redraw after 250ms of no resizing
    def _initialize_current_page_settings(self):
        """Initializes or resets the settings for the current page."""
        if self.current_page_index < len(self.doc):
            page = self.doc.load_page(self.current_page_index)
            self.current_line_x = page.rect.width / 2
            self.current_rotation = 0
    def load_page(self):
        """Render the current page on the canvas, scaled to fit, with the split line.

        When the index is past the last page, the processing of the finished
        document is started instead, at most once (guarded by
        ``self.processing``).
        """
        total_pages = len(self.doc)
        if self.current_page_index >= total_pages:
            if not self.processing:
                self.processing = True
                self.finish_and_process()
            return

        page = self.doc.load_page(self.current_page_index)
        self.page_label.config(text=f"Page {self.current_page_index + 1} of {total_pages}")

        # --- Fit the page inside the canvas ---
        avail_w = self.canvas.winfo_width()
        avail_h = self.canvas.winfo_height()
        if avail_w <= 1 or avail_h <= 1:
            # The canvas has no usable size yet, so there is nothing to render.
            return
        bounds = page.rect
        # Use 98% of the smaller zoom factor to leave a small margin.
        self.current_zoom = min(avail_w / bounds.width, avail_h / bounds.height) * 0.98
        zoom = self.current_zoom

        # --- Rasterise the page ---
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

        # The preview is flipped BEFORE the line is drawn, so the line is
        # always positioned in the coordinates of what the user sees.
        total_rotation = self.current_rotation + self.file_rotation + self.global_rotation
        if total_rotation % 360 != 0:
            img = img.rotate(180, expand=True)

        # --- Split line, scaled by the same zoom factor as the page ---
        line_x_scaled = self.current_line_x * zoom
        pen = ImageDraw.Draw(img)
        pen.line([(line_x_scaled, 0), (line_x_scaled, pix.height)], fill="red", width=3)

        # --- Show the result centred on the canvas ---
        self.photo_img = ImageTk.PhotoImage(img)
        self.canvas.delete("all")
        self.canvas.create_image(avail_w / 2, avail_h / 2, anchor="center",
                                 image=self.photo_img)
    def move_line_left(self, event=None):
        """Shift the split line left by half of CM_TO_POINTS, stopping at 0, and redraw."""
        step = CM_TO_POINTS / 2
        self.current_line_x = max(0, self.current_line_x - step)
        self.load_page()
    def move_line_right(self, event=None):
        """Shift the split line right by half of CM_TO_POINTS, stopping at the page width, and redraw."""
        page_width = self.doc.load_page(self.current_page_index).rect.width
        step = CM_TO_POINTS / 2
        self.current_line_x = min(page_width, self.current_line_x + step)
        self.load_page()
    def rotate_page(self, event=None):
        """Toggles the page rotation between 0 and 180 degrees."""
        self.current_rotation = 180 if self.current_rotation == 0 else 0
        self.load_page()
    def rotate_all_pages(self, event=None):
        """Toggles the page rotation between 0 and 180 degrees."""
        self.file_rotation = 180 if self.file_rotation == 0 else 0
        self.load_page()
    def rotate_all_files(self, event=None):
        """Toggles the page rotation between 0 and 180 degrees."""
        self.global_rotation = 180 if self.global_rotation == 0 else 0
        self.load_page()
    def keep_left(self, event=None):
        self.confirm_and_next_page(keep="left")
    def keep_right(self, event=None):
        self.confirm_and_next_page(keep="right")
    def discard_page(self, event=None):
        self.confirm_and_next_page(keep="none")
    def send_page_end(self, event=None):
        # Do nothing if we are already at or past the last page
        if self.current_page_index >= len(self.doc) - 1:
            return
        # Move the current page to the end of the document
        # -1 as the destination puts it after the last page
        self.doc.move_page(self.current_page_index, -1)
        # Initialize settings for the page that shifted into the current slot
        self._initialize_current_page_settings()
        # Reload the canvas to show the new page
        self.load_page()
    def confirm_and_next_page(self, event=None, keep="both"):
        """Saves the settings for the current page and moves to the next."""
        self.page_settings.append({
            "line_x": self.current_line_x,
            "rotation": self.current_rotation,
            "keep": keep
        })
        self.current_page_index += 1
        if self.current_page_index < len(self.doc):
            self._initialize_current_page_settings()
            self.load_page()
        else:
            self.finish_and_process()
            if self.setup_next_file():
                self._initialize_current_page_settings()
                self.load_page()
            else:
                self.master.destroy()
    def finish_and_process(self):
        """Starts the PDF splitting process."""
        self.split_pdf()
        self.reorder_pdfs()
        self.concate_files()
        self.remove_dirs()
    def split_filename_left(self, i):
        return os.path.join(self.split_dir, f"{self.base_name}_{i+1}l.pdf")
    def split_filename_right(self, i):
        return os.path.join(self.split_dir, f"{self.base_name}_{i+1}r.pdf")
    def reorder_filename(self, i):
        return os.path.join(self.reorder_dir, f"{self.base_name}_{i+1}.pdf")
    def clean_up_dir(self, dir, make=True):
        if make:
            os.makedirs(dir, exist_ok=True)
        pdf_files = glob.glob(os.path.join(dir, "*.pdf"))
        for pdf in pdf_files:
            try:
                os.remove(pdf)
            except Exception as e:
                print(f"Error deleting {pdf}: {e}")
    def remove_dirs(self):
        shutil.rmtree(self.split_dir)
        shutil.rmtree(self.reorder_dir)
    def split_pdf(self):
        """Split every confirmed page into a left and a right PDF, then close the document.

        For each entry of ``self.page_settings`` the page is cut at the saved
        ``line_x``. Only the parts selected by ``keep`` ("both", "left",
        "right" or "none") are built and written, to the paths given by
        ``split_filename_left`` and ``split_filename_right``.
        """
        print("Starting PDF processing...")
        self.clean_up_dir(self.split_dir)
        for i, settings in enumerate(self.page_settings):
            page = self.doc.load_page(i)
            line_x = settings['line_x']
            keep = settings['keep']
            width, height = page.rect.width, page.rect.height
            rotation = (page.rotation + settings['rotation'] +
                        self.file_rotation + self.global_rotation) % 360
            # The line is placed on the preview. When the output is rotated,
            # the previewed left part is the right part of the stored page.
            # Any non-zero rotation is handled as a half turn here.
            if rotation == 0:
                rect_left = fitz.Rect(0, 0, line_x, height)
                rect_right = fitz.Rect(line_x, 0, width, height)
            else:
                rect_left = fitz.Rect(width - line_x, 0, width, height)
                rect_right = fitz.Rect(0, 0, width - line_x, height)
            # Discarded parts are not built at all, so they cannot fail
            # (e.g. on a zero-width clip) and leave no open document behind.
            if keep in ("both", "left"):
                self._save_clipped_part(i, rect_left, rotation, self.split_filename_left(i))
            if keep in ("both", "right"):
                self._save_clipped_part(i, rect_right, rotation, self.split_filename_right(i))
        self.doc.close()
        print(f"\nProcessing complete. Files are in '{self.split_dir}' directory.")

    def _save_clipped_part(self, page_index, clip, rotation, output_path):
        """Write the ``clip`` area of page ``page_index`` to ``output_path`` as a one-page PDF."""
        part_doc = fitz.open()
        try:
            part_page = part_doc.new_page(width=clip.width, height=clip.height)
            part_page.show_pdf_page(part_page.rect, self.doc, page_index, clip=clip)
            part_page.set_rotation(rotation)
            part_doc.save(output_path)
        finally:
            # Always release the temporary document, even when saving fails.
            part_doc.close()
    def reorder_pdfs(self):
        """Reordonne les pages, si ce sont des copies doubles."""
        self.clean_up_dir(self.reorder_dir)
        ps = self.page_settings
        ri = 0
        i = 0
        while i < len(ps):
            # Si c'est une copie double
            if (ps[i]['keep'] == "both" or ps[i]['keep'] == "right") \
               and i < len(ps)-1 and (ps[i+1]['keep'] != "right"):
                shutil.copy2(self.split_filename_right(i), self.reorder_filename(ri))
                ri += 1
                if ps[i+1]['keep'] != "none":
                    shutil.copy2(self.split_filename_left(i+1), self.reorder_filename(ri))
                    ri += 1
                    if ps[i+1]['keep'] != "left":
                        shutil.copy2(self.split_filename_right(i+1), self.reorder_filename(ri))
                        ri += 1
                        if ps[i]['keep'] == "both":
                            shutil.copy2(self.split_filename_left(i), self.reorder_filename(ri))
                            ri += 1
                i += 2
            else:
                psk = ps[i]['keep']
                if psk == "left" or psk == "both":
                    shutil.copy2(self.split_filename_left(i), self.reorder_filename(ri))
                    ri += 1
                if psk == "right" or psk == "both":
                    shutil.copy2(self.split_filename_right(i), self.reorder_filename(ri))
                    ri += 1
                i += 1
    def concate_files(self):
        """Merge the PDFs of the reorder directory into ``self.final_file``.

        The parts are merged in the numeric order of the index that
        ``reorder_filename`` puts at the end of each name. The parent
        directory of the output is created when an output directory is in use.
        """
        def part_index(path):
            # "<base>_<n>.pdf" -> n; names without a numeric suffix sort last.
            stem = os.path.splitext(os.path.basename(path))[0]
            suffix = stem.rpartition("_")[2]
            if suffix.isdecimal():
                return (0, int(suffix), path)
            return (1, 0, path)

        writer = PdfWriter()
        # A plain string sort would put "..._10.pdf" before "..._2.pdf" and
        # scramble any document made of ten or more parts.
        pdf_files = sorted(glob.glob(os.path.join(self.reorder_dir, "*.pdf")), key=part_index)
        for pdf in pdf_files:
            reader = PdfReader(pdf)
            for page in reader.pages:
                writer.add_page(page)
        if self.output_dir is not None:
            os.makedirs(os.path.dirname(self.final_file), exist_ok=True)
        with open(self.final_file, "wb") as f:
            writer.write(f)
        print(f"Created merged PDF: {self.final_file}")
 # Script entry point: expects exactly one argument, the path of a PDF file
 # or of a directory (PDFPreviewer accepts both).
 if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python script_name.py <path_to_pdf_file>")
        sys.exit(1)
    pdf_file_path = sys.argv[1]
    # Build the Tk window and hand control over to its event loop.
    root = tk.Tk()
    app = PDFPreviewer(root, pdf_file_path)
    root.mainloop()
--- a/plotting.py
+++ b/plotting.py
@ -0,0 +1,272 @@
 import sys
 import json
 import threading
 import queue
 import subprocess
 import tkinter as tk
 from pathlib import Path
 from PIL import Image, ImageDraw, ImageFont, ImageTk
 from pypdf import PdfReader
 # --- Configuration & Globals ---
 # Width, in pixels, of the white margin added to the right of every image.
 padding = 60 # White margin to the right
 # Bounded queue between the worker thread (producer) and the GUI (consumer).
 image_queue = queue.Queue(maxsize=5) # Buffer a few images ahead
 # Font used for the box labels: DejaVu Sans at size 30 when it can be loaded,
 # otherwise Pillow's built-in default font.
 try:
    font = ImageFont.truetype("DejaVuSans.ttf", size=30)
 except OSError:
    font = ImageFont.load_default()
 # --- Processing Logic (Worker Thread) ---
 def page_number(b, nb_pages):
    """Return the 0-based index of the page column that contains the centre of a box.

    The image is treated as ``nb_pages`` columns of equal width laid side by
    side on a 0-1000 horizontal scale.

    Args:
        b: Box as ``[y_min, x_min, y_max, x_max]`` on the 0-1000 scale.
        nb_pages: Number of page columns; values below 1 are treated as 1.

    Returns:
        The column index, always within ``0 .. nb_pages - 1``.
    """
    nb_pages = max(1, nb_pages)
    # Guard against a zero width when there are more than 1000 columns.
    column_width = max(1, 1000 // nb_pages)
    center_x = (b[1] + b[3]) // 2
    # A centre on the right edge (or in the remainder left by the integer
    # division) would otherwise yield an index one past the last column.
    return max(0, min(nb_pages - 1, center_x // column_width))
 def prepare_image(image_path: str, bounding_boxes, all_labels, nb_pages):
    """
    Return a copy of the image, widened by a white right margin, with every box drawn.

    The boxes are sorted in place by page column, then by top edge. A box
    whose label comes earlier in ``all_labels`` than the previously seen known
    label is drawn in red, every other box in black. Nothing is displayed.
    """
    source = Image.open(image_path)
    # Read the pixel data now, so the image can be passed between threads safely.
    source.load()
    width, height = source.size

    # White canvas, ``padding`` pixels wider than the source, with the source at the left.
    canvas = Image.new(source.mode, (width + padding, height), "white")
    canvas.paste(source, (0, 0))
    pen = ImageDraw.Draw(canvas)

    bounding_boxes.sort(key=lambda item: (page_number(item["box_2d"], nb_pages), item["box_2d"][0]))

    previous_rank = -1
    for entry in bounding_boxes:
        box = entry["box_2d"]
        # Boxes are [y_min, x_min, y_max, x_max] on a 0-1000 scale; convert to
        # pixels, grow by 10 pixels on each side and stay inside the source image.
        top = max(0, int(box[0] * height / 1000) - 10)
        left = max(0, int(box[1] * width  / 1000) - 10)
        bottom = min(height, int(box[2] * height / 1000) + 10)
        right = min(width, int(box[3] * width  / 1000) + 10)

        label = entry.get("label")
        outline = "black"
        if label and label in all_labels:
            rank = all_labels.index(label)
            if rank < previous_rank:
                # This label appears out of the expected order.
                outline = "red"
            previous_rank = rank

        pen.rectangle(((left, top), (right, bottom)), outline=outline, width=4)

        if not label:
            continue
        # Write the label above the box, or below it when the box is near the top.
        text_y = top - 30 if top > 80 else bottom + 6
        pen.text((left + 8, text_y), label, fill=outline, font=font)
    return canvas
 def worker_thread(base_dir, files_to_process, all_labels):
    """
    Prepare every image that has a JSON file and push it on ``image_queue``.

    For each image, ``<stem>.json`` and ``<stem>.pdf`` are looked up in
    ``base_dir``. Images without a JSON file are skipped. The page count of
    the PDF gives the number of columns and defaults to 1 when the PDF is
    missing or unreadable. A ``(None, None)`` sentinel is queued at the end.

    Args:
        base_dir: ``pathlib.Path`` of the directory holding the JSON and PDF files.
        files_to_process: Iterable of image paths (``pathlib.Path``).
        all_labels: Labels in their expected order, passed to ``prepare_image``.
    """
    for img_path in files_to_process:
        json_path = base_dir / f"{img_path.stem}.json"
        if not json_path.exists():
            # Nothing to draw for this image; do not bother reading the PDF.
            continue
        pdf_path = base_dir / f"{img_path.stem}.pdf"
        nb_pages = 1
        if pdf_path.exists():
            try:
                nb_pages = len(PdfReader(pdf_path).pages)
            except Exception as e:
                # Best effort: carry on with a single column, but say so.
                print(f"Warning: could not read {pdf_path.name}, assuming 1 page: {e}")
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                json_result = json.load(f)
            bb_list = json_result.get("list", [])
            print(f"Processing {img_path.name}...")
            # Draw boxes
            pil_image = prepare_image(str(img_path), bb_list, all_labels, nb_pages)
            # Block if queue is full (waiting for user to view)
            image_queue.put((pil_image, json_path))
        except Exception as e:
            print(f"Error processing {img_path.name}: {e}")
    # Sentinel to indicate finished
    image_queue.put((None, None))
 # --- GUI Logic (Main Thread) ---
 class ImageViewer:
    def __init__(self, root, base_dir):
        """Build the viewer window, bind its shortcuts and start polling the image queue.

        Args:
            root: The Tk root window.
            base_dir: Base directory of the run (stored unchanged).
        """
        self.root = root
        self.base_dir = base_dir
        root.title("Bounding Box Viewer")

        # A single label shows either a status text or the image.
        self.label = tk.Label(root, text="Waiting for images...")
        self.label.pack(expand=True, fill="both")

        # Viewer state
        self.current_image = None
        self.current_json_path = None
        self.is_viewing = False
        self.scale_factor = 1.0        # displayed size / original size
        self.orig_size = (1, 1)        # size of the image before scaling

        # Keyboard shortcuts
        key_bindings = (
            ('<Return>', self.on_enter),
            ('e', self.on_edit),
            ('o', self.on_open_pdf),
            ('<Escape>', lambda e: self.root.quit()),
        )
        for sequence, handler in key_bindings:
            root.bind(sequence, handler)
        # Left mouse click on the image
        self.label.bind('<Button-1>', self.on_click)

        # Start polling the queue
        self.poll_queue()
    def poll_queue(self):
        """Show the next queued image when none is being viewed, then poll again in 100 ms.

        Receiving the ``None`` sentinel quits the main loop and stops the polling.
        """
        if not self.is_viewing:
            try:
                pil_image, json_path = image_queue.get_nowait()
            except queue.Empty:
                # Nothing ready yet; try again on the next tick.
                pass
            else:
                if pil_image is None:
                    print("All images processed.")
                    self.root.quit() # Stop the program
                    return
                self.display_image(pil_image, json_path)
        self.root.after(100, self.poll_queue)
    def on_open_pdf(self, event):
        if self.is_viewing and self.current_json_path:
            # Replace .json extension with .pdf
            pdf_path = self.current_json_path.with_suffix(".pdf")
            print(f"Opening {pdf_path}")
            # Use subprocess to run xdg-open without blocking
            subprocess.Popen(['xdg-open', str(pdf_path)])
    def display_image(self, pil_image, json_path):
        """Show ``pil_image`` in the label, shrunk when taller than the screen, and mark it as being viewed."""
        self.orig_size = pil_image.size
        self.scale_factor = 1.0

        # Keep 100 pixels of the screen height free.
        max_height = self.root.winfo_screenheight() - 100
        if pil_image.height > max_height:
            self.scale_factor = max_height / pil_image.height
            scaled_size = (int(pil_image.width * self.scale_factor),
                           int(pil_image.height * self.scale_factor))
            pil_image = pil_image.resize(scaled_size)

        self.tk_image = ImageTk.PhotoImage(pil_image)
        self.label.config(image=self.tk_image, text="")
        self.current_json_path = json_path
        self.is_viewing = True
        self.root.lift()
    def on_enter(self, event):
        if self.is_viewing:
            print("Next...")
            self.is_viewing = False
            self.label.config(image="", text="Loading next...")
    def on_edit(self, event):
        if self.is_viewing and self.current_json_path:
            print(f"Opening {self.current_json_path}")
            subprocess.Popen(['xdg-open', str(self.current_json_path)])
    def on_click(self, event):
        if not self.is_viewing: return
        # Map click to original image coordinates
        x = int(event.x / self.scale_factor)
        y = int(event.y / self.scale_factor)
        w, h = self.orig_size
        # Create 10px box (5px radius)
        # Coordinate format: [y_min, x_min, y_max, x_max] (0-1000 scale)
        box = [
            int(max(0, y - 5) / h * 1000),
            int(max(0, x - 5) / (w- padding) * 1000),
            int(min(h, y + 5) / h * 1000),
            int(min(w, x + 5) / (w - padding) * 1000),
        ]
        box_str = "{ \"box_2d\": " + str(box) + ", \"label\": \"\" },"
        print(f"Copied box at ({x},{y}): {box_str}")
        self.root.clipboard_clear()
        self.root.clipboard_append(box_str)
 # Script entry point. The single argument is either a directory that contains
 # a "Cutleft" sub-directory of JPEG files, or one file.
 if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python plotting_gui.py <directory_or_file>")
        sys.exit(1)
    input_path = Path(sys.argv[1])
    files_to_process = []
    if input_path.is_file():
        # File mode
        base_dir = input_path.parent
        stem = input_path.stem
        # Try to locate the image in Cutleft directory
        img_path = base_dir / "Cutleft" / f"{stem}.jpg"
        # Fallback: Check if user provided the jpg inside Cutleft directly
        if not img_path.exists() and input_path.parent.name == "Cutleft" and input_path.suffix.lower() == ".jpg":
             base_dir = input_path.parent.parent
             img_path = input_path
        if not img_path.exists():
            print(f"Error: Could not find image at {img_path}")
            sys.exit(1)
        files_to_process = [img_path]
    else:
        # Directory mode
        base_dir = input_path
        cutleft_dir = base_dir / "Cutleft"
        if not cutleft_dir.exists():
            print(f"Error: {cutleft_dir} does not exist.")
            sys.exit(1)
        files_to_process = sorted(cutleft_dir.glob("*.jpg"))
    # Expected label order: the non-empty lines of the "labels" file in the
    # base directory; an empty list when that file does not exist.
    try:
        all_labels = list(filter(None, (base_dir / "labels").read_text().splitlines()))
    except FileNotFoundError:
        all_labels = []
    # Start Processing Thread
    t = threading.Thread(target=worker_thread, args=(base_dir, files_to_process, all_labels))
    t.daemon = True # Kill thread if main app closes
    t.start()
    # Start GUI
    root = tk.Tk()
    app = ImageViewer(root, base_dir)
    root.mainloop()
--- a/rename_to_copie.sh
+++ b/rename_to_copie.sh
@ -0,0 +1,20 @@
 #!/bin/bash
 # Rename every PDF of a directory to Copie01.pdf, Copie02.pdf, ... following
 # the order in which the shell expands *.pdf.
 # Usage: rename_to_copie.sh <directory_path>
 # The script stops at the first name conflict; files renamed before that
 # point keep their new names.
 # Ensure a directory is provided
 if [ ! -d "$1" ]; then
    echo "Usage: $0 <directory_path>"
    exit 1
 fi
 # Go to the directory
 cd "$1" || exit
 count=1
 for file in *.pdf; do
    # Handle case where no pdfs exist
    [ -e "$file" ] || continue
    # Rename with 0-padding (e.g., Copie01.pdf)
    target=$(printf "Copie%02d.pdf" "$count")
    # A file that already has its final name is left alone.
    if [ "$file" != "$target" ]; then
        # Never overwrite an existing PDF: it may be one that has not been
        # renamed yet, and its content would be lost.
        if [ -e "$target" ]; then
            echo "Error: '$target' already exists, refusing to overwrite it with '$file'" >&2
            exit 1
        fi
        mv -- "$file" "$target"
    fi
    ((count++))
 done
--- a/rotate_all.sh
+++ b/rotate_all.sh
@ -0,0 +1,25 @@
 #!/bin/bash
 # Rotate every PDF of a directory by 180 degrees, in place, using qpdf.
 # Usage: rotate_all.sh <directory>
 # Check if an argument is provided
 if [ -z "$1" ]; then
    echo "Usage: $0 <directory>"
    exit 1
 fi
 # Try to change into the directory, exit on failure
 cd "$1" || { echo "Error: Cannot access directory '$1'"; exit 1; }
 # Enable nullglob: if no pdfs exist, the loop won't run once with "*.pdf"
 shopt -s nullglob
 # Work in a private scratch directory so that no existing file can be
 # clobbered. It lives inside the target directory so that the final mv stays
 # on the same filesystem, and it is removed when the script exits.
 tmp_dir=$(mktemp -d ./.rotate_all.XXXXXX) || { echo "Error: Cannot create temporary directory"; exit 1; }
 trap 'rm -rf -- "$tmp_dir"' EXIT
 tmp_file="$tmp_dir/rotated.pdf"
 for file in *.pdf; do
    # Rotate to a temporary file; the "./" prefix keeps a name starting with
    # "-" from being read as an option.
    if qpdf --rotate=+180 "./$file" "$tmp_file"; then
        mv -- "$tmp_file" "./$file"
        echo "Rotated: $file"
    else
        echo "Error processing: $file"
        # Clean up temp file if qpdf failed but created garbage
        rm -f -- "$tmp_file"
    fi
 done
--- a/splitting_int.py
+++ b/splitting_int.py
@ -0,0 +1,202 @@
 import fitz  # PyMuPDF
 from pypdf import PdfWriter
 from pypdf import PdfReader
 import os
 import sys
 import json
 import shutil
 from pathlib import Path
 from collections import defaultdict  # Added for grouping
 # input_pdf = "Une Interro/Split.pdf"
 def decode_json(pdf_file):
    """Read the boxes stored next to a PDF and turn them into cut positions.

    The JSON file has the same path as the PDF with a ``.json`` suffix and
    holds a ``"name"`` and a ``"list"`` of ``{"box_2d": ..., "label": ...}``
    entries. Boxes are ``[y_min, x_min, y_max, x_max]`` on a 0-1000 scale
    where the pages of the PDF sit side by side as columns of equal width.

    Args:
        pdf_file: Path of the PDF file (``str`` or ``Path``).

    Returns:
        tuple: ``(name, result)`` where ``result`` is a list of
        ``(label, page, y_top, y_bottom)`` tuples sorted by page, then
        ``y_top``. ``page`` is a valid 0-based page index; ``y_top`` and
        ``y_bottom`` are the top and bottom of the box, each moved up by one
        grid square, in thousandths of the page height.
    """
    file_path = Path(pdf_file)
    # Load JSON content from File.json
    with open(file_path.with_suffix(".json"), "r", encoding="utf-8") as f:
        json_result = json.load(f)
    # Get number of pages from File.pdf (at least 1, to avoid dividing by zero)
    nb_pages = max(1, len(PdfReader(file_path).pages))
    bb_list = json_result["list"]
    name = json_result["name"]
    column_width = max(1, 1000 // nb_pages)
    # 38 grid squares in a page
    carreau = 1000 // 38

    def page_number(b):
        # Clamp the column index: a box centred on the right edge would
        # otherwise point one past the last page and break doc[page] later.
        return max(0, min(nb_pages - 1, ((b[1] + b[3]) // 2) // column_width))

    result = []
    for d in bb_list:
        b, label = d["box_2d"], d["label"]
        result.append((label, page_number(b), b[0] - carreau, b[2] - carreau))
    result.sort(key=lambda x: (x[1], x[2]))
    return (name, result)
 def split_an_interro(base_dir,input_pdf, coords_list):
    """Cut one PDF into one output PDF per label.

    Each entry of ``coords_list`` is ``(label, page, y_top, y_bottom)`` with
    the vertical positions on a 0-1000 scale. A label's segment starts at its
    own ``y_top`` and ends at the ``y_bottom`` of the next entry, on the same
    page or at the top of the next entry's page. The last segment runs to the
    end of its page. All segments of a label are merged into
    ``<base_dir>/<pdf stem>/<label>.pdf``. Files found in that directory that
    were not produced by this run are moved to its ``Missing`` sub-directory.

    Args:
        base_dir: ``Path`` of the directory that receives the output folder.
        input_pdf: ``Path`` of the PDF to cut.
        coords_list: Cut positions, sorted by page then position.
    """
    import tempfile  # local import: only needed for the scratch directory

    output_dir = base_dir / input_pdf.stem
    generated_files = set()
    # Dictionary to collect parts for each label
    parts_by_label = defaultdict(list)

    # Drop consecutive duplicate labels: a label that appears at the end of a
    # page and again at the start of the next one is one continuous block,
    # not two separate cuts.
    filtered_coords = []
    for item in coords_list:
        # item[0] is the label/title
        if not filtered_coords or item[0] != filtered_coords[-1][0]:
            filtered_coords.append(item)
    coords_list = filtered_coords

    def scale_coord(y, page):
        """Scale y from 0–1000 range to PDF points."""
        return (y / 1000) * page.rect.height

    def save_cropped_page(doc, page_num, y0, y1, out_path):
        """
        Saves a cropped portion of a page as a new PDF,
        correctly handling the original page's rotation.
        """
        page = doc[page_num]
        # 1. Define the crop rectangle in the VISUAL (rotated) coordinate system.
        rotated_rect = page.rect * page.transformation_matrix
        visual_crop_rect = fitz.Rect(rotated_rect.x0, y0, rotated_rect.x1, y1)
        # 2. Transform it back into the UNROTATED system, which is what the
        # 'clip' argument of show_pdf_page expects.
        unrotated_clip_rect = visual_crop_rect * page.derotation_matrix
        temp_doc = fitz.open()
        try:
            # New page with the dimensions of the visual crop
            temp_page = temp_doc.new_page(
                width=visual_crop_rect.width,
                height=visual_crop_rect.height
            )
            temp_page.show_pdf_page(
                temp_page.rect,      # fill the whole new page
                doc,                 # Source document
                page_num,
                rotate=-page.rotation,  # Cancel the original page's rotation
                clip=unrotated_clip_rect  # The area to take from the source page
            )
            temp_doc.save(out_path)
        finally:
            temp_doc.close()

    doc = fitz.open(input_pdf)
    try:
        # The intermediate parts live in a private scratch directory, so they
        # never collide with files of the working directory and are removed
        # even when an error interrupts the run.
        with tempfile.TemporaryDirectory(prefix="split_parts_") as tmp_dir:
            for idx, (title, page_nb, ymin, _unused) in enumerate(coords_list):
                page = doc[page_nb]
                page_bottom = page.rect.height
                y_start = scale_coord(ymin, page)
                first_part = os.path.join(tmp_dir, f"_part_{idx}_0.pdf")
                temp_parts = []
                if idx + 1 < len(coords_list):
                    next_page_nb = coords_list[idx + 1][1]
                    next_ymax = coords_list[idx + 1][3]
                    if next_page_nb == page_nb:
                        # Same page
                        y_end = scale_coord(next_ymax, page)
                        save_cropped_page(doc, page_nb, y_start, y_end, first_part)
                        temp_parts.append(first_part)
                    else:
                        # Current page part
                        save_cropped_page(doc, page_nb, y_start, page_bottom, first_part)
                        temp_parts.append(first_part)
                        # Next page part, skipped when shorter than 10 points.
                        # NOTE(review): pages strictly between page_nb and
                        # next_page_nb are not included — confirm this is intended.
                        y_end_next = scale_coord(next_ymax, doc[next_page_nb])
                        if y_end_next >= 10:
                            second_part = os.path.join(tmp_dir, f"_part_{idx}_1.pdf")
                            save_cropped_page(doc, next_page_nb, 0, y_end_next, second_part)
                            temp_parts.append(second_part)
                else:
                    # Last segment to end of page
                    save_cropped_page(doc, page_nb, y_start, page_bottom, first_part)
                    temp_parts.append(first_part)
                # Collect parts for this label instead of writing immediately
                parts_by_label[title].extend(temp_parts)

            output_dir.mkdir(parents=True, exist_ok=True)
            # Process aggregated parts by label
            for title, parts in parts_by_label.items():
                merger = PdfWriter()
                for part in parts:
                    merger.append(part)
                filename = f"{title}.pdf"
                merger.write(output_dir / filename)
                merger.close()
                generated_files.add(filename)
    finally:
        doc.close()

    # --- Cleanup Logic ---
    # Move files not generated in this run to 'Missing' folder. The listing is
    # taken up front because entries are renamed while looping.
    missing_dir = output_dir / "Missing"
    for item in list(output_dir.iterdir()):
        if item.is_file() and item.name not in generated_files:
            print(f"ALERT: File '{item.name}' in '{input_pdf.stem}' was not generated. Moving to {missing_dir}")
            missing_dir.mkdir(exist_ok=True)
            item.rename(missing_dir / item.name)
 # Script entry point. The argument is either one PDF file or a directory
 # whose PDF files are all processed. A PDF is only split when a JSON file
 # with the same name sits next to it.
 if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python scrit.py <directory or pdf_file>")
        sys.exit(1)
    input_arg = Path(sys.argv[1])
    if input_arg.is_file():
        # If a single file is provided, process only that file.
        # base_dir is assumed to be the directory containing the file.
        base_dir = input_arg.parent
        pdf_files = [input_arg]
    elif input_arg.is_dir():
        # If a directory is provided, process all PDFs inside.
        base_dir = input_arg
        pdf_files = sorted(base_dir.glob("*.pdf"))
    else:
        print(f"Error: {input_arg} is not a valid file or directory.")
        sys.exit(1)
    for pdf_path in pdf_files:
            json_path = pdf_path.with_suffix(".json")
            if json_path.exists():
                # decode_json reads the JSON file that matches this PDF.
                (name, coords) = decode_json(pdf_path)
                print("Decoded name : ", name)
                split_an_interro(base_dir, pdf_path, coords)
            else:
                print(f"Warning: No JSON found for {pdf_path.name} at {json_path}")