"""Split an exam into per-question statement and solution files using Gemini.

Given a folder containing ``enonce.pdf``, an ``enonce`` source file and a
``correction`` source file (each either ``.org`` or ``.tex``), the script asks
Gemini to pair every question label with its statement and solution source,
then writes:

* ``labels``       -- one (filesystem-safe) label per line, in reading order
* ``Text/<label>`` -- the original label followed by the question source
* ``Sol/<label>``  -- the original label followed by the solution source
"""

import argparse
import os
import sys
from pathlib import Path
from typing import List, Optional

from google import genai
from google.genai import types
from pydantic import BaseModel, Field

MODEL_ID = "gemini-3-flash-preview"
api_key = os.environ.get("GEMINI_API_KEY")


class QuestionItem(BaseModel):
    """One question of the exam, paired with its solution."""

    label: str = Field(description="The unique label of the question (e.g., '1.a', 'Exercice 1')")
    question_content: str = Field(description="The source text of the question, strictly extracted from the enonce file, EXCLUDING the label itself.")
    solution_content: str = Field(description="The source text of the solution, strictly extracted from the correction file.")


class ExamExtraction(BaseModel):
    """Top-level structured response: the questions in reading order."""

    questions: List[QuestionItem]


PROMPT = """I am providing:
1. A PDF of an exam (`enonce.pdf`)
2. The source code of the exam questions (`enonce` file)
3. The source code of the exam solutions (`correction` file)

Your task:
1. Identify all distinct question labels using the PDF document. These labels should be unique : use `Ex 1 : 1)a)` or `I)1)b)`.
2. For each label, extract its exact corresponding question text from the `enonce` source file. Do not include the label itself in this extracted text (nor LaTeX like `item` nor org-mode list labelling like `2.`).
3. For each label, extract its exact corresponding solution textual content from the `correction` source file.

Return the result as a JSON list in the exact reading order of the document.
"""


def find_file(folder: Path, base_name: str) -> Optional[Path]:
    """Return ``folder/base_name`` with the first existing supported extension.

    ``.org`` is tried before ``.tex``. Returns ``None`` when neither file
    exists.
    """
    for ext in (".org", ".tex"):
        path = folder / f"{base_name}{ext}"
        if path.is_file():
            return path
    return None


def _safe_filename(label: str) -> str:
    """Turn a question label into a single, safe path component.

    Path separators (``/`` and, for Windows, ``\\``), line breaks and NUL are
    replaced by ``_`` so the label can neither escape the output directory nor
    break the one-label-per-line ``labels`` file. Labels that would resolve to
    a directory (empty, ``.`` or ``..``) are rewritten as well.
    """
    cleaned = label
    for forbidden in ("/", "\\", "\n", "\r", "\0"):
        cleaned = cleaned.replace(forbidden, "_")
    if cleaned in ("", ".", ".."):
        # "" and "." would point at the output directory itself, ".." at its
        # parent; none of them can be opened as a regular file.
        cleaned = cleaned.replace(".", "_") or "_"
    return cleaned


def process_exam(folder_path: str):
    """Extract questions and solutions for the exam stored in *folder_path*.

    Reads ``enonce.pdf`` plus the ``enonce`` and ``correction`` sources, sends
    them to Gemini with a JSON schema derived from ``ExamExtraction``, and
    writes the ``labels`` file and the ``Text/`` and ``Sol/`` directories
    inside the same folder.

    Prints an error and exits with status 1 when an input file is missing or
    when the model returns no text.
    """
    folder = Path(folder_path)

    # 1. Resolve files
    pdf_path = folder / "enonce.pdf"
    enonce_path = find_file(folder, "enonce")
    correction_path = find_file(folder, "correction")

    missing = []
    if not pdf_path.is_file():
        missing.append("enonce.pdf")
    if enonce_path is None:
        missing.append("enonce.org or enonce.tex")
    if correction_path is None:
        missing.append("correction.org or correction.tex")

    if missing:
        print(f"Error: Missing files in {folder}: {', '.join(missing)}")
        sys.exit(1)

    print("Reading files...")
    pdf_bytes = pdf_path.read_bytes()
    enonce_text = enonce_path.read_text(encoding="utf-8")
    correction_text = correction_path.read_text(encoding="utf-8")

    client = genai.Client(api_key=api_key)

    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_text(text=PROMPT),
                types.Part.from_bytes(data=pdf_bytes, mime_type="application/pdf"),
                types.Part.from_text(text=f"--- ENONCE SOURCE ({enonce_path.name}) ---\n{enonce_text}"),
                types.Part.from_text(text=f"--- CORRECTION SOURCE ({correction_path.name}) ---\n{correction_text}"),
            ],
        )
    ]

    config = types.GenerateContentConfig(
        temperature=0.1,
        response_mime_type="application/json",
        response_json_schema=ExamExtraction.model_json_schema(),
    )

    print("Sending request to Gemini...")
    response = client.models.generate_content(
        model=MODEL_ID, contents=contents, config=config
    )

    # Fail with a clear message instead of an opaque validation error when
    # the response carries no text.
    if not response.text:
        print("Error: Gemini returned an empty response.")
        sys.exit(1)

    extracted_data = ExamExtraction.model_validate_json(response.text)

    # 2. Setup output directories
    text_dir = folder / "Text"
    sol_dir = folder / "Sol"
    text_dir.mkdir(exist_ok=True)
    sol_dir.mkdir(exist_ok=True)

    labels_file = folder / "labels"

    print("Writing files...")
    with open(labels_file, "w", encoding="utf-8") as flabels:
        for q in extracted_data.questions:
            safe_label = _safe_filename(q.label)
            flabels.write(f"{safe_label}\n")

            # The extracted source is written verbatim. A blanket replacement
            # of the two-character sequence backslash-n by a newline must NOT
            # be applied here: it would corrupt LaTeX commands such as \nu,
            # \neq, \nabla or \noindent.
            (text_dir / safe_label).write_text(
                f"{q.label}\n{q.question_content}", encoding="utf-8"
            )
            (sol_dir / safe_label).write_text(
                f"{q.label}\n{q.solution_content}", encoding="utf-8"
            )

    print(f"Success! Processed {len(extracted_data.questions)} questions.")


if __name__ == "__main__":
    if not api_key:
        print("Error: GEMINI_API_KEY environment variable is not set.")
        sys.exit(1)

    parser = argparse.ArgumentParser(description="Extract exam and solution code via Gemini.")
    parser.add_argument("folder", help="Directory containing the exam files")
    args = parser.parse_args()

    process_exam(args.folder)