# Copies/annotating.py

import sys
import os
import json
import glob
from pathlib import Path
import subprocess
from PIL import Image

# Horizontal offset at which compose_label_image pastes the page image; the
# output canvas is widened by the same amount.
MARGIN_LEFT = 300
# Width (width_px) requested from the text renderer for each local annotation.
ANNOT_WIDTH = 600

# The result maps: Copie id -> label -> {pdf_path, result, coordinates}.
# "coordinates" are the real coordinates (hmin, hmax) of the image in the Group.
# The box_2d coordinates inside "result" are un-normalized here (0-1000 scale -> pixels).
def _find_coordinates(json_files, student_id):
    """Look up a student's entry in the coordinate files of one label.

    Each JSON file is expected to hold a list of entries shaped like
    ``["id", x, y, width_r, "label"]``; the image an entry refers to is the
    ``.jpg`` file that shares the JSON file's base name.

    Args:
        json_files: Paths of the candidate coordinate files.
        student_id: Identifier compared against the first field of each entry.

    Returns:
        ``((entry[1], entry[2]), (width, height))`` for the first matching
        entry, where the size is read from the companion image, or
        ``(None, None)`` when no file mentions ``student_id``.
    """
    for json_path in json_files:
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                coord_list = json.load(f)
        except json.JSONDecodeError:
            # Unparseable coordinate file: ignore it and try the next one.
            continue

        for entry in coord_list:
            if entry[0] == student_id:
                img_path = os.path.splitext(json_path)[0] + ".jpg"
                with Image.open(img_path) as img:
                    return (entry[1], entry[2]), img.size
    return None, None


def make_dictionary(root_dir):
    """Build the per-student, per-label annotation dictionary.

    Reads ``<root_dir>/correction.json`` (label -> list of groups of items,
    each item carrying ``id`` and ``result``) and, for every item, searches
    ``<root_dir>/<label>/*.json`` for the student's coordinates.

    The ``box_2d`` values found in ``result["feedback"]`` are rescaled in
    place from the 0-1000 range to pixels of the matching image
    (indices 0 and 2 use the image height, indices 1 and 3 the width).
    When no coordinate entry exists for the student the image size is
    unknown, so the boxes are left untouched and a warning is printed
    instead of failing on ``None`` arithmetic.

    Args:
        root_dir: Directory containing ``correction.json`` and the label
            sub-directories.

    Returns:
        ``{student_id: {label: {"pdf_path", "result", "coordinates"}}}``
        where ``coordinates`` is ``None`` if no entry was found.

    Exits the process with status 1 if ``correction.json`` is missing.
    """
    correction_path = os.path.join(root_dir, "correction.json")

    # Load correction data
    try:
        with open(correction_path, 'r', encoding='utf-8') as f:
            corrections = json.load(f)
    except FileNotFoundError:
        print(f"Error: {correction_path} not found.")
        sys.exit(1)

    # Dictionary: keys are IDs
    result_data = {}

    for label, groups in corrections.items():
        # The set of coordinate files only depends on the label: list it once.
        json_files = glob.glob(os.path.join(root_dir, label, "*.json"))

        # Flatten the groups of items.
        items = [item for group in groups for item in group]
        for item in items:
            student_id = item['id']
            result_obj = item['result']

            coordinates, size = _find_coordinates(json_files, student_id)

            # Un-normalize the boxes (in place) when the image size is known.
            boxes = [el["box_2d"] for el in result_obj.get("feedback", [])
                     if el.get("box_2d")]
            if size is not None:
                width, height = size
                for box in boxes:
                    box[0] = (box[0] * height) // 1000
                    box[2] = (box[2] * height) // 1000
                    box[1] = (box[1] * width) // 1000
                    box[3] = (box[3] * width) // 1000
            elif boxes:
                print(f"Warning: no coordinates found for Copie {student_id} "
                      f"/ {label}; box_2d left normalized.")

            # Construct PDF path: Dir/Copie{id}/{label}.pdf
            pdf_path = os.path.join(root_dir, f"Copie{student_id}", f"{label}.pdf")

            result_data.setdefault(student_id, {})[label] = {
                "pdf_path": pdf_path,
                "result": result_obj,
                "coordinates": coordinates
            }

    return result_data

def make_base_image(pdf_path):
    """Rasterize a PDF and stack its pages vertically into one image.

    Args:
        pdf_path: Path of the PDF handed to ``convert_from_path``.

    Returns:
        ``(image, total_height, max_width)`` where ``image`` is an RGBA
        canvas with a white background, as wide as the widest page and as
        tall as all pages together; pages are pasted flush left, top to
        bottom, in document order.
    """
    page_images = convert_from_path(pdf_path)

    heights = [p.height for p in page_images]
    widths = [p.width for p in page_images]
    total_h = sum(heights)
    max_w = max(widths)

    stacked = Image.new("RGBA", (max_w, total_h), "white")

    offset_y = 0
    for page_img, page_h in zip(page_images, heights):
        stacked.paste(page_img.convert("RGBA"), (0, offset_y))
        offset_y += page_h

    return (stacked, total_h, max_w)

import io
import shutil
from pdf2image import convert_from_path
from PIL import Image, ImageDraw, ImageFont
import matplotlib
matplotlib.use('Agg')  # Force headless rendering
import matplotlib.pyplot as plt

# plt.rcParams.update({ "text.usetex": True,
                     # "text.latex.preamble": r"\usepackage{bbold}"})

import re
import textwrap

def normalize_mathtext(text):
    """
    Rewrite LaTeX that Matplotlib's mathtext parser does not support.

    Applied in order:
      * ``\\le`` -> ``\\leq``, ``\\ge`` -> ``\\geq``, ``\\implies`` -> ``\\Rightarrow``
      * doubled backslashes collapsed to a single one
      * ``\\llbracket`` / ``\\rrbracket`` emulated with brackets and ``\\!``
      * the number-set shortcuts ``\\R \\N \\Z \\C \\Q`` -> ``\\mathbb{...}``
      * a literal form-feed character turned back into ``\\f`` and the
        U+0010 control character removed

    The short commands are matched with a ``(?![a-zA-Z])`` lookahead so that
    longer commands sharing the prefix are left alone: ``\\left`` is not
    turned into ``\\leqft`` and ``\\Rightarrow`` / ``\\Re`` are not turned
    into ``\\mathbb{R}ightarrow`` / ``\\mathbb{R}e``.

    Args:
        text: Source string, possibly containing ``$...$`` math.

    Returns:
        The rewritten string.
    """
    text = re.sub(r'\\le(?![a-zA-Z])', r'\\leq', text)
    text = re.sub(r'\\ge(?![a-zA-Z])', r'\\geq', text)
    text = re.sub(r'\\implies', r'\\Rightarrow', text)
    # Sometimes, Gemini escapes too much ? Not sure
    text = text.replace("\\\\", "\\")
    text = text.replace("\\llbracket", "[\\![")
    text = text.replace("\\rrbracket", "]\\!]")
    # Number sets: only the bare one-letter command, never a prefix of a
    # longer one (this runs after \implies became \Rightarrow).
    text = re.sub(r'\\([RNZCQ])(?![a-zA-Z])', r'\\mathbb{\1}', text)
    # Sometimes, Gemini doesn't escape enough. In the json, you should have \\f
    text = text.replace('\f', r'\f')
    text = re.sub('\u0010', "", text)
    return text

def wrap_latex_text(text, width_chars):
    """
    Wraps text but keeps LaTeX math blocks ($...$) intact.

    Plain text is split on whitespace; every ``$...$`` span is kept as one
    unbreakable token. Tokens are then greedily packed into lines of at most
    ``width_chars`` characters (single spaces between tokens included). A
    token longer than ``width_chars`` is placed alone on its own line; it
    never produces an empty line before it.

    Args:
        text: Text to wrap.
        width_chars: Maximum number of characters per line.

    Returns:
        The wrapped text, lines joined with a newline ("" for empty input).
    """
    # 1. Split text into chunks of: text, math, text, math...
    # The regex looks for $...$ (non-greedy).
    parts = re.split(r'(\$[^\$]+\$)', text)

    # 2. Tokenize: break plain text on whitespace, keep math blocks whole.
    tokens = []
    for part in parts:
        if part.startswith('$') and part.endswith('$'):
            tokens.append(part)
        else:
            tokens.extend(part.split())

    # 3. Greedy line filling.
    lines = []
    current_line = []
    current_length = 0  # always len(" ".join(current_line))

    for token in tokens:
        # A separating space is only needed when the line already has content.
        needed = len(token) + (1 if current_line else 0)

        if current_line and current_length + needed > width_chars:
            lines.append(" ".join(current_line))
            current_line = [token]
            current_length = len(token)
        else:
            current_line.append(token)
            current_length += needed

    if current_line:
        lines.append(" ".join(current_line))

    return "\n".join(lines)

def render_latex_text(text, width_px, bg_color=(255, 255, 255, 255), max_lines=None,
                      fontsize=14):
    """Render text (with ``$...$`` mathtext spans) to an RGBA image via Matplotlib.

    The text is first passed through ``normalize_mathtext`` and pre-wrapped
    with ``wrap_latex_text``; the figure height grows with the number of
    wrapped lines.

    Args:
        text: Text to render.
        width_px: Figure width in pixels (at 100 dpi).
        bg_color: RGBA color composited behind the transparent rendering.
        max_lines: Accepted for interface compatibility; no truncation is
            performed.
        fontsize: Matplotlib font size.

    Returns:
        An RGBA ``PIL.Image`` of the rendered text on ``bg_color``. Its size
        is whatever the tight bounding box yields, not necessarily
        ``width_px`` wide.
    """
    # 1. Fix unsupported symbols
    cleaned = normalize_mathtext(text)

    dpi = 100
    width_in = width_px / dpi

    # Heuristic line capacity: about 10 characters per inch of width.
    line_capacity = int(width_in * 10)
    body = wrap_latex_text(cleaned, line_capacity)

    # 0.3 inch per wrapped line plus a 0.2 inch buffer.
    line_count = len(body.split('\n'))
    height_in = line_count * 0.3 + 0.2

    figure = plt.figure(figsize=(width_in, height_in), dpi=dpi)

    # wrap=False: the wrapping was already done above.
    text_style = {
        "fontsize": fontsize,
        "verticalalignment": 'top',
        "horizontalalignment": 'left',
        "wrap": False,
    }
    plt.text(0.01, 0.95, body, **text_style)
    plt.axis('off')

    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format='png', bbox_inches='tight', pad_inches=0.1, transparent=True)
    plt.close(figure)
    png_buffer.seek(0)

    rendered = Image.open(png_buffer).convert("RGBA")

    # Lay the transparent rendering over the requested background.
    backdrop = Image.new("RGBA", rendered.size, bg_color)
    backdrop.alpha_composite(rendered)
    return backdrop

import os
import tempfile
import subprocess
import PIL.ImageOps


def render_real_latex_text(text, width_px, bg_color=(255, 255, 255, 255), max_lines=None, fontsize=19):
    """Render ``text`` with a real ``pdflatex`` run and return it as an RGBA image.

    The text is inserted verbatim into a ``standalone`` document whose
    ``varwidth`` option limits the width to ``width_px`` (at 100 dpi) while
    the height is cropped to the content. The document loads a ``commands``
    package, which must therefore be resolvable by ``pdflatex``. The
    resulting PDF page is rasterized, turned into black text with an alpha
    channel derived from its darkness, and composited over ``bg_color``.

    Args:
        text: LaTeX source placed in the document body.
        width_px: Maximum text width in pixels.
        bg_color: RGBA background color.
        max_lines: If truthy, the image is cropped to the height of that
            many lines (line height taken as ``fontsize * 1.2`` points).
        fontsize: Font size in points.

    Returns:
        An RGBA ``PIL.Image``.

    Raises:
        RuntimeError: If ``pdflatex`` produced no PDF. The message ends with
            the last lines of the compiler output to make the failure
            diagnosable.
        FileNotFoundError: If the ``pdflatex`` executable cannot be started.
    """
    dpi = 100
    width_in = width_px / dpi
    line_spacing = int(fontsize * 1.2)

    # Use the 'standalone' class with 'varwidth' to auto-crop height while restricting width
    latex_template = f"""\\documentclass[varwidth={width_in}in,margin=0.2cm]{{standalone}}
\\usepackage[utf8]{{inputenc}}
\\usepackage[T1]{{fontenc}}
\\usepackage{{lmodern}}  % Enables arbitrary font scaling
\\usepackage{{amsmath, amssymb}}
\\usepackage{{commands}}
%\\usepackage{{anyfontsize}} % replaced by lmodern
\\begin{{document}}
\\fontsize{{{fontsize}}}{{{line_spacing}}}\\selectfont
{text}
\\end{{document}}
"""

    with tempfile.TemporaryDirectory() as temp_dir:
        tex_path = os.path.join(temp_dir, 'text.tex')
        pdf_path = os.path.join(temp_dir, 'text.pdf')

        with open(tex_path, 'w', encoding='utf-8') as f:
            f.write(latex_template)

        # Compile to PDF. The output is captured (not printed) so that it can
        # be attached to the error below; undecodable bytes are replaced.
        proc = subprocess.run(
            ['pdflatex', '-interaction=nonstopmode', 'text.tex'],
            cwd=temp_dir,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            encoding='utf-8',
            errors='replace',
        )

        if not os.path.exists(pdf_path):
            log_tail = "\n".join((proc.stdout or "").splitlines()[-15:])
            raise RuntimeError(
                "LaTeX compilation failed. Check your LaTeX syntax.\n"
                f"--- pdflatex output (last lines) ---\n{log_tail}"
            )

        # Convert PDF to grayscale (ignoring pdf2image's broken transparency)
        images = convert_from_path(pdf_path, dpi=dpi)
        gray_img = images[0].convert("L")

    # 1. Invert grayscale to create an alpha mask (white bg = 0, black text = 255)
    alpha_mask = PIL.ImageOps.invert(gray_img)

    # 2. Create a transparent image with black text using the mask
    text_img = Image.new("RGBA", gray_img.size, (0, 0, 0, 255))
    text_img.putalpha(alpha_mask)

    # 3. Create the requested background and composite the text over it
    final_img = Image.new("RGBA", text_img.size, bg_color)
    final_img.alpha_composite(text_img)

    # (Optional) Truncate image height if max_lines is strictly enforced
    if max_lines:
        max_height_px = int((fontsize * 1.2 / 72.0) * dpi * max_lines)  # Points to pixels
        if final_img.height > max_height_px:
            final_img = final_img.crop((0, 0, final_img.width, max_height_px))

    return final_img

import io
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from highlight_text import ax_text

def color(score):
    """Map a score to a hex color running from red (low) to green (high).

    The score is divided by 4 and clamped to [0, 1]; that fraction is then
    stretched (``* 1.5 - 0.25``) and clamped again, so the low and high ends
    of the range saturate. The result blends from RGB (200, 0, 0) to
    (0, 150, 0).

    Args:
        score: Anything accepted by ``float()``.

    Returns:
        The color as a hex string produced by ``matplotlib.colors.to_hex``.
    """
    fraction = max(0.0, min(1.0, float(score) / 4.0))
    stretched = max(0.0, min(1.0, fraction * 1.5 - 0.25))

    rgb = (
        200 * (1 - stretched) / 255,
        150 * stretched / 255,
        0,
    )
    return mcolors.to_hex(rgb)

def render_score_text(label, score, error, width_px, fontsize=18,
                      bg_color=(255, 255, 255, 255),
                      with_error=True, id=None):
    """Render the one-line score header for a label as an RGBA image.

    The line reads ``[<id>  ]<label>    Note : <score>[   <error>]``. The
    score is drawn bold in the color returned by ``color(score)``; the error
    (shown only when ``with_error`` is true and ``error`` is neither empty
    nor the string "null") is drawn bold in orange.

    Args:
        label: Label text.
        score: Score value, interpolated into the text and passed to ``color``.
        error: Error text, or a falsy value / "null" for none.
        width_px: Figure width in pixels (at 100 dpi); the height is fixed
            at 0.8 inch.
        fontsize: Font size passed to ``ax_text``.
        bg_color: RGBA color composited behind the transparent rendering.
        with_error: Whether the error may be shown.
        id: Optional prefix; skipped when falsy.

    Returns:
        An RGBA ``PIL.Image``.
    """
    # highlight_text markup: each <...> span takes the matching props entry.
    highlight_props = [{"color": color(score), "fontweight": "bold"}]
    segments = [f"{label}    Note : <{score}>"]

    show_error = bool(error) and error != "null" and bool(with_error)
    if show_error:
        segments.append(f"   <{error}>")
        highlight_props.append({"color": "orange", "fontweight": "bold"})

    if id:
        segments.insert(0, f"{id}  ")

    header_line = "".join(segments)

    dpi = 100
    figure, axes = plt.subplots(figsize=(width_px / dpi, 0.8), dpi=dpi)
    axes.axis('off')

    ax_text(
        0.02, 0.98, header_line,
        fontsize=fontsize,
        verticalalignment='top',
        horizontalalignment='left',
        highlight_textprops=highlight_props,
        ax=axes,
    )

    # No bbox_inches='tight' here: it caused issues with this figure.
    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format='png', pad_inches=0.05, transparent=True)
    plt.close(figure)
    png_buffer.seek(0)

    rendered = Image.open(png_buffer).convert("RGBA")

    # Lay the transparent rendering over the requested background.
    backdrop = Image.new("RGBA", rendered.size, bg_color)
    backdrop.alpha_composite(rendered)
    return backdrop

def compose_label_image(base_img, label, result, hmin,
                        render_fn=render_latex_text,
                        draw_callback=None,
                        with_error=True,
                        with_empty=False,
                        more_right=False,
                        with_id=None):
    """
    Composes the final image with annotations.

    Layout, top to bottom: the score header, one rendered image per global
    feedback (feedback without ``box_2d``), then ``base_img`` pasted
    ``MARGIN_LEFT`` pixels from the left. Each local feedback (with
    ``box_2d``) gets a red rectangle on the page and its text pasted at
    x=10, vertically centered on the box but pushed down so that it never
    overlaps the previous text; the canvas is extended downwards if a text
    would not fit.

    Args:
        base_img: The source PDF converted to image.
        label: Label name (e.g. "Ex1").
        result: The JSON result object (score, feedbacks). Feedback entries
            containing a "to_delete" key are ignored; entries containing a
            "norectangle" key get their text but no rectangle.
        hmin: Vertical offset coordinate, subtracted from the box
            y-coordinates before drawing.
        render_fn: Function to render text to image (allows threading injection).
            Called as ``render_fn(text, width)`` for global feedback and as
            ``render_fn(text, width_px=..., bg_color=..., max_lines=None)``
            for local feedback.
        draw_callback: Optional function(type, draw_obj, position_dict, data_dict)
                       called when elements are placed. Used for checkboxes.
                       ``type`` is one of "header_item", "local_rect",
                       "local_text".
        with_error: Forwarded to ``render_score_text``.
        with_empty: When false, a result whose error is "empty-answer" is
            not rendered at all.
        more_right: When true the score header is rendered wider
            (``base_img.width // 2``) and pasted at x=150 instead of x=0.
        with_id: Forwarded to ``render_score_text`` as ``id``.

    Returns:
        ``(final_img, header_height)`` where ``final_img`` is an RGB image
        and ``header_height`` the summed height of the header images, or
        ``(None, 0)`` for a skipped empty answer.
    """
    score = result.get('score', 0)
    error = result.get('error', "")
    feedbacks = result.get('feedback', [])

    if error == "empty-answer" and not with_empty:
        return None, 0


    # Filter deleted items (used by reading_annotations.py)
    feedbacks = [f for f in feedbacks if "to_delete" not in f]

    # Feedback without a box goes to the header, the rest is drawn on the
    # page, processed from top to bottom (sorted by the box's first value).
    global_fb = [f for f in feedbacks if not f.get('box_2d')]
    local_fb = [f for f in feedbacks if f.get('box_2d')]
    local_fb.sort(key=lambda x: x['box_2d'][0])

    # 1. Prepare Headers
    header_elements = []

    if more_right:
        width = base_img.width // 2
    else:
        width = base_img.width // 2 - 150
    img_score = render_score_text(label, score, error, width,
                                  fontsize=18, with_error=with_error,
                                  id=with_id)
    header_elements.append({"type": "score", "img": img_score, "data": result})

    # Global Feedbacks
    for idx, fb in enumerate(global_fb):
        img_fb = render_fn(fb['text'], base_img.width)
        header_elements.append({"type": "global_fb", "img": img_fb, "data": fb, "index": idx})

    # Calculate Header Height
    header_height = sum(el["img"].height for el in header_elements)
    total_height = base_img.height + header_height

    # Create Canvas
    final_img = Image.new("RGB", (base_img.width + MARGIN_LEFT, total_height), "white")

    # Draw Headers
    current_y = 0
    draw = ImageDraw.Draw(final_img, "RGBA")

    for el in header_elements:
        if el["type"] == "score" and more_right:
            final_img.paste(el["img"], (150, current_y))
        else:
            final_img.paste(el["img"], (0, current_y))


        if draw_callback:
            # Hook for checkboxes
            # NOTE(review): the reported x is 0 even when the score image was
            # pasted at x=150 above (more_right) -- confirm this is intended.
            draw_callback("header_item", draw,
                          {"x": 0, "y": current_y, "w": el["img"].width, "h": el["img"].height},
                          el)
        current_y += el["img"].height

    # Paste Base Image
    image_offset_y = current_y
    final_img.paste(base_img, (MARGIN_LEFT, image_offset_y))

    # 2. Draw Local Annotations
    draw = ImageDraw.Draw(final_img, "RGBA") # Refresh draw object
    last_text_bottom = 0

    for idx, fb in enumerate(local_fb):
        box = fb.get('box_2d')
        ymin, xmin, ymax, xmax = box

        # Translate the box into canvas coordinates: remove hmin, then shift
        # by the header height and the left margin.
        target_ymin = (ymin - hmin) + image_offset_y
        target_ymax = (ymax - hmin) + image_offset_y
        target_xmin = xmin + MARGIN_LEFT
        target_xmax = xmax + MARGIN_LEFT

        # Draw Rectangle (if not suppressed)
        if "norectangle" not in fb:
            draw.rectangle([target_xmin, target_ymin, target_xmax, target_ymax], outline="red", width=3)

            if draw_callback:
                 draw_callback("local_rect", draw,
                              {"box": [target_xmin, target_ymin, target_xmax, target_ymax]},
                              {"data": fb, "index": idx})

        # Render Text
        txt_img = render_fn(fb['text'], width_px=ANNOT_WIDTH,
                            bg_color=(255, 200, 200, 180), max_lines=None)

        # Calculate Position: centered on the box, never above the page top.
        center_y = (target_ymin + target_ymax) / 2
        paste_y = center_y - (txt_img.height / 2)
        paste_y = max(paste_y, image_offset_y)

        # Avoid overlapping the previous annotation text.
        if paste_y < last_text_bottom:
            paste_y = last_text_bottom + 5

        # Resize canvas if needed
        required_height = int(paste_y + txt_img.height + 20)
        if required_height > final_img.height:
            new_final = Image.new("RGB", (final_img.width, required_height), "white")
            new_final.paste(final_img, (0, 0))
            final_img = new_final
            # The draw object is bound to the old image: rebuild it.
            draw = ImageDraw.Draw(final_img, "RGBA")

        # Paste Text (the text image is its own mask, so its alpha is honoured)
        final_img.paste(txt_img, (10, int(paste_y)), mask=txt_img)

        if draw_callback:
            draw_callback("local_text", draw,
                          {"x": 10, "y": int(paste_y), "w": txt_img.width, "h": txt_img.height},
                          {"data": fb, "index": idx})

        last_text_bottom = paste_y + txt_img.height

    return final_img, header_height

from utils import natural_key
import concurrent.futures

def process_student(student_id, labels_data, root_dir, all_labels, overwrite):
    """Render every annotated label of one student and concatenate them.

    Output goes to ``<root_dir>/Anot/Copie<student_id>/``: one
    ``<label>.jpg`` per label whose PDF exists and converts, a
    ``score.json`` mapping every label of ``all_labels`` to its score as a
    string ("" when the label was not processed), and ``Concat.jpg`` with
    all non-empty answers stacked vertically.

    Args:
        student_id: Identifier used in the ``Copie<id>`` folder names.
        labels_data: ``{label: {"result": ..., "coordinates": ...}}`` for
            this student. ``coordinates`` may be ``None`` (no entry found);
            the vertical offset then defaults to 0.
        root_dir: Directory containing the ``Copie<id>`` folders.
        all_labels: All label names, used as keys of ``score.json``.
        overwrite: When false, a student whose ``Concat.jpg`` already exists
            is skipped; otherwise the output folder is wiped and rebuilt.
    """
    # Prepare output directory: Dir/Anot/CopieID
    output_dir = os.path.join(root_dir, "Anot", f"Copie{student_id}")

    # Check if already processed (Concat.jpg exists)
    concat_path = os.path.join(output_dir, "Concat.jpg")
    if os.path.exists(concat_path) and not overwrite:
        print(f"Skipping Copie {student_id} (Concat.jpg exists)")
        return

    print("Processing :", student_id)

    # Clean folder if re-processing
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    d_notes = dict.fromkeys(all_labels, "")
    label_images = []

    sorted_labels = sorted(labels_data.items(), key=natural_key)

    for label, content in sorted_labels:
        # 1. Find PDF path: Dir/Copie{id}/{label}.pdf
        pdf_full_path = os.path.join(root_dir, f"Copie{student_id}", f"{label}.pdf")

        if not os.path.exists(pdf_full_path):
            print(f"File not found: {pdf_full_path}")
            continue

        # 2. Convert PDF to Image (best effort: a broken PDF skips the label)
        try:
            (base_img, _, _) = make_base_image(pdf_full_path)
        except Exception as e:
            print(f"Error converting {pdf_full_path}: {e}")
            continue

        result = content.get('result', {})
        # The key can be present with a None value, which .get()'s default
        # would not replace; fall back explicitly.
        coordinates = content.get('coordinates') or (0, 0)  # (hmin, hmax)
        score = result.get('score', 0)
        d_notes[label] = str(score)

        final_img, _ = compose_label_image(base_img, label, result, coordinates[0],
                                           with_empty=True,
                                           render_fn=render_real_latex_text)
        # 3. Save Image
        save_path = os.path.join(output_dir, f"{label}.jpg")
        final_img.save(save_path)
        # Empty answers get their own image but are left out of Concat.jpg.
        if result.get('error', "") != "empty-answer":
            label_images.append(final_img)

    # Save scores
    with open(os.path.join(output_dir, "score.json"), "w", encoding="utf-8") as f:
        json.dump(d_notes, f, indent=4)

    # Concatenate
    if label_images:
        max_w = max(i.width for i in label_images)
        total_h = sum(i.height for i in label_images)
        # White background, like every other canvas in this file, so that
        # narrower images are not padded with black.
        canvas = Image.new('RGB', (max_w, total_h), "white")
        cy = 0
        for img in label_images:
            canvas.paste(img, (0, cy))
            cy += img.height
        canvas.save(concat_path)


def process_correction(root_dir, data, all_labels, overwrite=False):
    """Annotate every student of ``data``, one at a time, in sorted id order.

    Args:
        root_dir: Directory containing the copies.
        data: ``{student_id: labels_data}`` as built by ``make_dictionary``.
        all_labels: All label names, forwarded to ``process_student``.
        overwrite: Forwarded to ``process_student``.
    """
    # A ThreadPoolExecutor variant (max_workers=2, one submitted future per
    # student, then concurrent.futures.wait) was tried and is deliberately
    # disabled. Do not thread this application:
    #  1. the calls to matplotlib would have to be protected,
    #  2. the errors would be lost.
    for student_id in sorted(data):
        process_student(student_id, data[student_id], root_dir, all_labels, overwrite)

import argparse

if __name__ == "__main__":
    # Command line: annotating.py ROOT_DIR [--overwrite]
    parser = argparse.ArgumentParser(description="Annotate copies")
    parser.add_argument("root_dir", help="Directory containing the copies")
    parser.add_argument("--overwrite", action="store_true", help="Reprocess even if Concat.jpg exists")

    args = parser.parse_args()
    root_dir = args.root_dir

    # <root_dir>/labels lists one label per line; empty lines are dropped.
    # Read it as UTF-8 like the other text files opened by this script,
    # instead of depending on the platform's locale encoding.
    labels_text = (Path(root_dir) / "labels").read_text(encoding="utf-8")
    labels = [line for line in labels_text.splitlines() if line]

    results = make_dictionary(root_dir)
    # results is: Copie id -> label -> {pdf_path, result, coordinates}
    # Coordinates are the real coordinates (hmin, hmax) of the image in the Group
    process_correction(root_dir, results, labels, overwrite=args.overwrite)