# Copies/gemini_for_enonce.py

import os
import sys
import argparse
from pathlib import Path
from pydantic import BaseModel, Field
from typing import List
from google import genai
from google.genai import types

# Name of the Gemini model passed to `generate_content()` in process_exam.
MODEL_ID = "gemini-3-flash-preview"
# API key read from the environment at import time; None when GEMINI_API_KEY is
# unset (the `__main__` guard rejects that case before any request is made).
api_key = os.environ.get("GEMINI_API_KEY")

# One extracted question: its label plus the matching statement and solution
# source text. The Field descriptions end up in the JSON schema that
# process_exam sends as `response_json_schema`, so they double as per-field
# instructions for the model.
# NOTE: documented with comments rather than a docstring, because pydantic
# copies a model's docstring into its generated JSON schema, which would alter
# the request.
class QuestionItem(BaseModel):
    label: str = Field(description="The unique label of the question (e.g., '1.a', 'Exercice 1')")
    question_content: str = Field(description="The source text of the question, strictly extracted from the enonce file, EXCLUDING the label itself.")
    solution_content: str = Field(description="The source text of the solution, strictly extracted from the correction file.")

# Top-level shape of the model's JSON answer: an object holding the list of
# questions. Its JSON schema is sent with the request and the response text is
# validated against it in process_exam.
# NOTE: kept docstring-free on purpose; a docstring would be added to the
# generated JSON schema.
class ExamExtraction(BaseModel):
    questions: List[QuestionItem]

# Instruction text sent as the first part of the user message; the PDF bytes
# and the two source files follow it as separate parts (see process_exam).
# NOTE(review): step 3 asks for "a JSON list", while the response schema is an
# object with a `questions` list. The schema is what constrains the output;
# confirm the wording is intended.
PROMPT = """I am providing:
1. A PDF of an exam (`enonce.pdf`)
2. The source code of the exam questions (`enonce` file)
3. The source code of the exam solutions (`correction` file)

Your task:
 1. Identify all distinct question labels using the PDF document.
    These labels should be unique : use `Ex 1 : 1)a)` or `I)1)b)`.
 2. For each label, extract its exact corresponding question text
    from the `enonce` source file. Do not include the label itself
    in this extracted text (nor LaTeX like `item` nor org-mode list
    labelling like `2.`).
 3. For each label, extract its exact corresponding solution textual
    content from the `correction` source file. Return the result as
    a JSON list in the exact reading order of the document.
"""

def find_file(
    folder: Path,
    base_name: str,
    extensions: "tuple[str, ...]" = (".org", ".tex"),
) -> "Path | None":
    """Return the first existing ``folder/base_name<ext>`` file, or ``None``.

    Args:
        folder: Directory to look in.
        base_name: File name without its extension (e.g. ``"enonce"``).
        extensions: Candidate suffixes, tried in order; the first match wins.
            Defaults to org-mode first, then LaTeX.

    Returns:
        The path of the first candidate that is a regular file, or ``None``
        when no candidate exists (directories do not count as matches).
    """
    candidates = (folder / f"{base_name}{ext}" for ext in extensions)
    return next((path for path in candidates if path.is_file()), None)

def _safe_label(label: str) -> str:
    """Return *label* turned into a name usable as a single file name."""
    # Path separators (POSIX and Windows) would let a model-produced label
    # escape the Text/ and Sol/ output directories.
    safe = label.replace("/", "_").replace("\\", "_")
    # "", "." and ".." resolve to directories, so opening them for writing
    # would fail; prefix them to get an ordinary file name.
    if safe in {"", ".", ".."}:
        safe = f"_{safe}"
    return safe


def _write_question_files(folder: Path, questions: "List[QuestionItem]") -> None:
    """Write the ``labels`` index and one ``Text/``/``Sol/`` file per question."""
    text_dir = folder / "Text"
    sol_dir = folder / "Sol"
    text_dir.mkdir(exist_ok=True)
    sol_dir.mkdir(exist_ok=True)

    seen = set()
    with open(folder / "labels", "w", encoding="utf-8") as flabels:
        for q in questions:
            safe_label = _safe_label(q.label)
            if safe_label in seen:
                # Same file name twice: the later question replaces the earlier one.
                print(f"Warning: duplicate label {safe_label!r}; earlier files are overwritten.")
            seen.add(safe_label)

            flabels.write(f"{safe_label}\n")

            # NOTE: the content is written exactly as returned. Blindly turning
            # a literal backslash-n into a newline would corrupt LaTeX commands
            # such as \neq, \nabla or \noindent.
            (text_dir / safe_label).write_text(
                f"{q.label}\n{q.question_content}", encoding="utf-8"
            )
            (sol_dir / safe_label).write_text(
                f"{q.label}\n{q.solution_content}", encoding="utf-8"
            )


def process_exam(folder_path: str) -> None:
    """Split one exam folder into per-question statement and solution files.

    Reads ``enonce.pdf`` plus the ``enonce`` and ``correction`` sources
    (``.org`` or ``.tex``) from the folder, sends them to Gemini with a JSON
    schema for the answer, and writes:

    * ``labels``: one sanitized label per line, in the returned order;
    * ``Text/<label>``: the original label followed by the question source;
    * ``Sol/<label>``: the original label followed by the solution source.

    Args:
        folder_path: Directory containing the exam files; outputs are written
            into the same directory.

    Raises:
        SystemExit: With status 1 when an input file is missing or the model
            returns no text.
    """
    folder = Path(folder_path)

    # 1. Resolve files
    pdf_path = folder / "enonce.pdf"
    enonce_path = find_file(folder, "enonce")
    correction_path = find_file(folder, "correction")

    missing = []
    if not pdf_path.is_file():
        missing.append("enonce.pdf")
    if enonce_path is None:
        missing.append("enonce.org or enonce.tex")
    if correction_path is None:
        missing.append("correction.org or correction.tex")

    if missing:
        print(f"Error: Missing files in {folder}: {', '.join(missing)}")
        sys.exit(1)

    print("Reading files...")
    pdf_bytes = pdf_path.read_bytes()
    enonce_text = enonce_path.read_text(encoding="utf-8")
    correction_text = correction_path.read_text(encoding="utf-8")

    client = genai.Client(api_key=api_key)

    # Single user turn: instructions, then the PDF, then both source files.
    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_text(text=PROMPT),
                types.Part.from_bytes(data=pdf_bytes, mime_type="application/pdf"),
                types.Part.from_text(text=f"--- ENONCE SOURCE ({enonce_path.name}) ---\n{enonce_text}"),
                types.Part.from_text(text=f"--- CORRECTION SOURCE ({correction_path.name}) ---\n{correction_text}"),
            ],
        )
    ]

    config = types.GenerateContentConfig(
        temperature=0.1,
        response_mime_type="application/json",
        response_json_schema=ExamExtraction.model_json_schema(),
    )

    print("Sending request to Gemini...")
    response = client.models.generate_content(
        model=MODEL_ID,
        contents=contents,
        config=config,
    )

    # An empty/blocked answer has no text; fail with a clear message instead
    # of an opaque validation error.
    if not response.text:
        print("Error: Gemini returned an empty response.")
        sys.exit(1)

    extracted_data = ExamExtraction.model_validate_json(response.text)

    # 2. Setup output directories and write the results
    print("Writing files...")
    _write_question_files(folder, extracted_data.questions)

    print(f"Success! Processed {len(extracted_data.questions)} questions.")

if __name__ == "__main__":
    # The key is checked before the command line is parsed, so the script
    # refuses to do anything (including showing usage) without credentials.
    if not api_key:
        print("Error: GEMINI_API_KEY environment variable is not set.")
        sys.exit(1)

    cli = argparse.ArgumentParser(
        description="Extract exam and solution code via Gemini.",
    )
    cli.add_argument("folder", help="Directory containing the exam files")

    # Single positional argument: the exam folder to process.
    process_exam(cli.parse_args().folder)