# Copies/gemini_for_enonce.py
#
# 137 lines
# 4.8 KiB
# Python

import os
import sys
import argparse
from pathlib import Path
from pydantic import BaseModel, Field
from typing import List
from google import genai
from google.genai import types
# Name of the Gemini model passed to `client.models.generate_content`.
MODEL_ID = "gemini-3-flash-preview"
# API key read from the environment once, at import time; None when the
# variable is unset (the `__main__` guard checks for that before any request).
api_key = os.environ.get("GEMINI_API_KEY")
# One extracted question: its label, the question source text and the
# matching solution source text. The `description` strings below end up in
# the JSON schema handed to the model via `model_json_schema()`.
# NOTE: documented with `#` comments rather than a class docstring, because
# pydantic copies a model docstring into the generated schema.
class QuestionItem(BaseModel):
    # Unique identifier of the question inside the exam.
    label: str = Field(
        description="The unique label of the question (e.g., '1.a', 'Exercice 1')",
    )
    # Question text taken from the enonce source, without the label.
    question_content: str = Field(
        description=(
            "The source text of the question, strictly extracted from the "
            "enonce file, EXCLUDING the label itself."
        ),
    )
    # Solution text taken from the correction source.
    solution_content: str = Field(
        description=(
            "The source text of the solution, strictly extracted from the "
            "correction file."
        ),
    )
# Top-level shape of the model's JSON answer: an object wrapping the list of
# extracted questions. Its JSON schema is passed as `response_json_schema`
# and the raw response text is validated against it in `process_exam`.
class ExamExtraction(BaseModel):
    # Extracted questions, in the order returned by the model.
    questions: List[QuestionItem]
# Instruction text sent as the first part of the user message in
# `process_exam`, ahead of the PDF bytes and the two source files.
# NOTE(review): the prompt asks for "a JSON list", while the response schema
# (`ExamExtraction`) is an object holding a `questions` list -- confirm the
# wording is intentional.
PROMPT = """I am providing:
1. A PDF of an exam (`enonce.pdf`)
2. The source code of the exam questions (`enonce` file)
3. The source code of the exam solutions (`correction` file)
Your task:
1. Identify all distinct question labels using the PDF document.
These labels should be unique : use `Ex 1 : 1)a)` or `I)1)b)`.
2. For each label, extract its exact corresponding question text
from the `enonce` source file. Do not include the label itself
in this extracted text (nor LaTeX like `item` nor org-mode list
labelling like `2.`).
3. For each label, extract its exact corresponding solution textual
content from the `correction` source file. Return the result as
a JSON list in the exact reading order of the document.
"""
def find_file(folder: Path, base_name: str) -> "Path | None":
    """Locate the source file called *base_name* inside *folder*.

    The extensions ``.org`` and ``.tex`` are tried in that order, so an
    org-mode file wins when both exist.

    Args:
        folder: Directory to look in.
        base_name: File name without extension (e.g. ``"enonce"``).

    Returns:
        The path of the first candidate that is a regular file, or ``None``
        when neither candidate exists (or the candidate is not a file).
    """
    for ext in (".org", ".tex"):
        candidate = folder / f"{base_name}{ext}"
        if candidate.is_file():
            return candidate
    return None
def _safe_label(label: str) -> str:
    """Turn a question label into a name usable as a single file name.

    ``/`` is replaced by ``_`` so the label cannot point outside the output
    directory. A label that is empty or made only of dots/spaces (``""``,
    ``"."``, ``".."``) would resolve to a directory instead of a file, so it
    is replaced by the placeholder ``"_"``.
    """
    name = label.replace("/", "_")
    if not name.strip(". "):
        name = "_"
    return name


def process_exam(folder_path: str) -> None:
    """Extract per-question text and solutions from an exam folder via Gemini.

    The folder must contain ``enonce.pdf``, an ``enonce`` source and a
    ``correction`` source (each ``.org`` or ``.tex``). The three files are
    sent to the model together with ``PROMPT``; the JSON answer is validated
    as an ``ExamExtraction`` and written out as:

    * ``<folder>/labels`` -- one file-system-safe label per line,
    * ``<folder>/Text/<label>`` -- original label, newline, question text,
    * ``<folder>/Sol/<label>`` -- original label, newline, solution text.

    Args:
        folder_path: Directory containing the exam files.

    Exits the process with status 1 (message on stderr) when an input file
    is missing or when the model returns an empty response. Errors raised
    by the API client or by schema validation propagate unchanged.
    """
    folder = Path(folder_path)

    # 1. Resolve files
    pdf_path = folder / "enonce.pdf"
    enonce_path = find_file(folder, "enonce")
    correction_path = find_file(folder, "correction")

    missing = []
    if not pdf_path.is_file():
        missing.append("enonce.pdf")
    if not enonce_path:
        missing.append("enonce.org or enonce.tex")
    if not correction_path:
        missing.append("correction.org or correction.tex")
    if missing:
        print(f"Error: Missing files in {folder}: {', '.join(missing)}", file=sys.stderr)
        sys.exit(1)

    print("Reading files...")
    pdf_bytes = pdf_path.read_bytes()
    enonce_text = enonce_path.read_text(encoding="utf-8")
    correction_text = correction_path.read_text(encoding="utf-8")

    client = genai.Client(api_key=api_key)
    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_text(text=PROMPT),
                types.Part.from_bytes(data=pdf_bytes, mime_type="application/pdf"),
                types.Part.from_text(text=f"--- ENONCE SOURCE ({enonce_path.name}) ---\n{enonce_text}"),
                types.Part.from_text(text=f"--- CORRECTION SOURCE ({correction_path.name}) ---\n{correction_text}"),
            ],
        )
    ]
    config = types.GenerateContentConfig(
        temperature=0.1,
        response_mime_type="application/json",
        response_json_schema=ExamExtraction.model_json_schema(),
    )

    print("Sending request to Gemini...")
    response = client.models.generate_content(
        model=MODEL_ID,
        contents=contents,
        config=config,
    )

    # An empty/absent text (e.g. a response without usable content) would
    # otherwise fail obscurely inside the JSON validation below.
    if not response.text:
        print("Error: Gemini returned an empty response.", file=sys.stderr)
        sys.exit(1)
    extracted_data = ExamExtraction.model_validate_json(response.text)

    # 2. Setup output directories
    text_dir = folder / "Text"
    sol_dir = folder / "Sol"
    text_dir.mkdir(exist_ok=True)
    sol_dir.mkdir(exist_ok=True)
    labels_file = folder / "labels"

    print("Writing files...")
    used_names = set()
    with open(labels_file, "w", encoding="utf-8") as flabels:
        for q in extracted_data.questions:
            # Sanitize the label for the file system, then make it unique so
            # that two questions can never overwrite each other's files.
            base_name = _safe_label(q.label)
            safe_label = base_name
            suffix = 1
            while safe_label in used_names:
                suffix += 1
                safe_label = f"{base_name}_{suffix}"
            if safe_label != base_name:
                print(
                    f"Warning: duplicate label {q.label!r}; writing it as {safe_label!r}.",
                    file=sys.stderr,
                )
            used_names.add(safe_label)
            flabels.write(f"{safe_label}\n")

            # The extracted text is written verbatim. A blanket replacement of
            # the two characters backslash + "n" by a newline is deliberately
            # not applied: it would corrupt LaTeX commands that start with
            # them (\nu, \neq, \newline, ...).
            (text_dir / safe_label).write_text(
                f"{q.label}\n{q.question_content}", encoding="utf-8"
            )
            (sol_dir / safe_label).write_text(
                f"{q.label}\n{q.solution_content}", encoding="utf-8"
            )

    print(f"Success! Processed {len(extracted_data.questions)} questions.")
if __name__ == "__main__":
    # Parse the command line first so that `--help` and usage errors work
    # even when no API key is configured.
    parser = argparse.ArgumentParser(description="Extract exam and solution code via Gemini.")
    parser.add_argument("folder", help="Directory containing the exam files")
    args = parser.parse_args()

    # Fail early, before any file is read, when the key is missing.
    if not api_key:
        print("Error: GEMINI_API_KEY environment variable is not set.", file=sys.stderr)
        sys.exit(1)

    process_exam(args.folder)