137 lines
4.8 KiB
Python
137 lines
4.8 KiB
Python
import argparse
import os
import sys
from pathlib import Path
from typing import List, Optional

from google import genai
from google.genai import types
from pydantic import BaseModel, Field
|
|
|
|
# Identifier of the Gemini model the extraction request is sent to.
MODEL_ID = "gemini-3-flash-preview"

# API key, read once at import time; None when GEMINI_API_KEY is not set.
api_key = os.getenv("GEMINI_API_KEY")
|
|
|
|
# One extracted question paired with its solution.
#
# NOTE: documented with comments on purpose. The field descriptions below are
# part of the JSON schema built from this model, and a class docstring would
# be added to that schema as well, changing what the model is sent.
class QuestionItem(BaseModel):
    # Label identifying the question within the exam.
    label: str = Field(
        description="The unique label of the question (e.g., '1.a', 'Exercice 1')",
    )
    # Question source text, without the label.
    question_content: str = Field(
        description=(
            "The source text of the question, strictly extracted from the "
            "enonce file, EXCLUDING the label itself."
        ),
    )
    # Solution source text for the same label.
    solution_content: str = Field(
        description=(
            "The source text of the solution, strictly extracted from the "
            "correction file."
        ),
    )
|
|
|
|
# Top-level shape of the model's answer: the list of extracted questions.
# (Kept as a comment rather than a docstring so the generated JSON schema
# stays exactly as it was.)
class ExamExtraction(BaseModel):
    questions: List[QuestionItem]
|
|
|
|
# Instruction text sent as the first part of the request, ahead of the PDF and
# the two source files. This is runtime text: edit it only to change what the
# model is asked to do.
PROMPT = """I am providing:
1. A PDF of an exam (`enonce.pdf`)
2. The source code of the exam questions (`enonce` file)
3. The source code of the exam solutions (`correction` file)

Your task:
1. Identify all distinct question labels using the PDF document.
These labels should be unique : use `Ex 1 : 1)a)` or `I)1)b)`.
2. For each label, extract its exact corresponding question text
from the `enonce` source file. Do not include the label itself
in this extracted text (nor LaTeX like `item` nor org-mode list
labelling like `2.`).
3. For each label, extract its exact corresponding solution textual
content from the `correction` source file. Return the result as
a JSON list in the exact reading order of the document.
"""
|
|
|
|
def find_file(folder: Path, base_name: str) -> Optional[Path]:
    """Locate the source file called *base_name* inside *folder*.

    Candidates are tried in a fixed order: ``<base_name>.org`` first, then
    ``<base_name>.tex``.

    Args:
        folder: Directory to look in.
        base_name: File name without its extension (e.g. ``"enonce"``).

    Returns:
        The path of the first candidate that is an existing regular file, or
        ``None`` when no candidate exists.
    """
    candidates = (folder / f"{base_name}{ext}" for ext in (".org", ".tex"))
    return next((path for path in candidates if path.is_file()), None)
|
|
|
|
def _safe_label(label: str) -> str:
    """Turn a model-provided label into a single, usable file name."""
    # Both separators are replaced so the label can never address another
    # directory, whichever platform the script runs on.
    cleaned = label.replace("/", "_").replace("\\", "_")
    # "", "." and ".." would resolve to a directory rather than a file.
    if cleaned in ("", ".", ".."):
        cleaned = f"_{cleaned}"
    return cleaned


def process_exam(folder_path: str) -> None:
    """Split an exam into per-question text and solution files using Gemini.

    Reads ``enonce.pdf`` plus the ``enonce`` and ``correction`` sources
    (``.org`` or ``.tex``) from the folder, sends them together with
    ``PROMPT`` to the model ``MODEL_ID``, and validates the JSON answer as an
    ``ExamExtraction``. It then writes, inside the same folder:

    * ``labels``: one sanitised label per line, in the order returned;
    * ``Text/<label>``: the original label on the first line, then the
      question text;
    * ``Sol/<label>``: the original label on the first line, then the
      solution text.

    Args:
        folder_path: Directory containing the exam files.

    Exits the process with status 1 when a required input file is missing or
    when the model returns no text. Errors raised while validating the answer
    against ``ExamExtraction`` are not caught here.
    """
    folder = Path(folder_path)

    # 1. Resolve files
    pdf_path = folder / "enonce.pdf"
    enonce_path = find_file(folder, "enonce")
    correction_path = find_file(folder, "correction")

    missing = []
    if not pdf_path.is_file():
        missing.append("enonce.pdf")
    if not enonce_path:
        missing.append("enonce.org or enonce.tex")
    if not correction_path:
        missing.append("correction.org or correction.tex")

    if missing:
        print(f"Error: Missing files in {folder}: {', '.join(missing)}")
        sys.exit(1)

    print("Reading files...")
    pdf_bytes = pdf_path.read_bytes()
    enonce_text = enonce_path.read_text(encoding="utf-8")
    correction_text = correction_path.read_text(encoding="utf-8")

    client = genai.Client(api_key=api_key)

    # A single user turn: instructions, the rendered PDF, then both sources.
    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_text(text=PROMPT),
                types.Part.from_bytes(data=pdf_bytes, mime_type="application/pdf"),
                types.Part.from_text(text=f"--- ENONCE SOURCE ({enonce_path.name}) ---\n{enonce_text}"),
                types.Part.from_text(text=f"--- CORRECTION SOURCE ({correction_path.name}) ---\n{correction_text}"),
            ],
        )
    ]

    config = types.GenerateContentConfig(
        temperature=0.1,
        response_mime_type="application/json",
        response_json_schema=ExamExtraction.model_json_schema(),
    )

    print("Sending request to Gemini...")
    response = client.models.generate_content(
        model=MODEL_ID,
        contents=contents,
        config=config,
    )

    # Fail with a readable message instead of a validation traceback when the
    # answer carries no text at all.
    raw_json = response.text
    if not raw_json:
        print("Error: Gemini returned an empty response.")
        sys.exit(1)

    extracted_data = ExamExtraction.model_validate_json(raw_json)

    # 2. Setup output directories
    text_dir = folder / "Text"
    sol_dir = folder / "Sol"
    text_dir.mkdir(exist_ok=True)
    sol_dir.mkdir(exist_ok=True)

    labels_file = folder / "labels"

    print("Writing files...")
    with open(labels_file, "w", encoding="utf-8") as flabels:
        for q in extracted_data.questions:
            safe_label = _safe_label(q.label)
            flabels.write(f"{safe_label}\n")

            # The extracted text is written exactly as returned. A blanket
            # replacement of backslash-n by a newline must not be applied
            # here: it would break LaTeX macros such as \nabla or \neq.
            (text_dir / safe_label).write_text(
                f"{q.label}\n{q.question_content}", encoding="utf-8"
            )
            (sol_dir / safe_label).write_text(
                f"{q.label}\n{q.solution_content}", encoding="utf-8"
            )

    print(f"Success! Processed {len(extracted_data.questions)} questions.")
|
|
|
|
if __name__ == "__main__":
    # Parse the command line first so that `--help` and usage errors are
    # reported even when no API key is configured.
    parser = argparse.ArgumentParser(description="Extract exam and solution code via Gemini.")
    parser.add_argument("folder", help="Directory containing the exam files")
    args = parser.parse_args()

    # The request cannot be made without a key, so stop before any file I/O.
    if not api_key:
        print("Error: GEMINI_API_KEY environment variable is not set.")
        sys.exit(1)

    process_exam(args.folder)
|