120 lines
3.8 KiB
Python
120 lines
3.8 KiB
Python
from google import genai
|
|
from google.genai import types
|
|
import base64
|
|
from pathlib import Path
|
|
|
|
import os

# Gemini model used for every request issued by this script.
MODEL_ID = "gemini-3-flash-preview"

# Credentials should not live in source control: prefer the GEMINI_API_KEY
# environment variable and only fall back to the in-file value (kept so that
# existing setups relying on it behave exactly as before).
api_key = os.environ.get("GEMINI_API_KEY") or "REMOVED_API_KEY"
|
|
|
|
# Prompt template sent alongside the exam image.
# `##labels##` and `##names##` mark where the list of expected labels and the
# list of candidate student names belong in the text.
# The example object is kept strictly valid JSON (box_2d is a JSON array) so
# that it agrees with the `[ymin, xmin, ymax, xmax]` instruction above it.
my_prompt = """I'm giving you an image of the left columns of a written exam.
Students answer several exercises, which can have several questions.

The image consists of several columns, separated by vertical black
lines. The image should be read top to bottom and then left to right,
meaning first column, then second column, etc.

In their sheet, students delimit exercises and questions using
delimiters such as `Ex 1`, or `Exercice 1`, and `1)` or `a)`. You need
to give me the bounding boxes of each delimiter.

When giving the bounding box of the first question of an exercise, the
box should be large enough to contain both the exercice label
(`Exercice i`) and the question label (`1)`) parts.

You also need to give me the student name. It should appear on the top
left of the image. Disregard any mention of `MPSI 3`, it is their
class. A list of possible student names will be given below.

You will answer with a JSON object, containing a `name` field with the
name, and a `list` field, with the list of the bounding boxes and
their labels. The box_2d should be [ymin, xmin, ymax, xmax] normalized
to 0-1000.

Here is an example :
{"name" : "John Doe", "list" : [{"box_2d": [10, 20, 30, 40], "label" : "Ex 1 : 1)"}]}

Do not provide a box_2d for the name. Only for the labels.

You may find the same label present several times, as a student either
recall the current label on a new page, or adds content to its answer
later on. Give the position of each instance of each label.

For this exam you should look for the labels given below, separated by
newlines. A student need not have answered every question, so some may
be missing.

##labels##

Here's a list of the names of the students, pick the one that matches
the best or `"Unknown"` if you cannot read the name

##names##"""
|
|
|
|
from pydantic import BaseModel, Field
|
|
from typing import List
|
|
|
|
# One detected delimiter: where it is on the image and which label it carries.
# Documented with `#` comments only, so the class definition itself (from
# which the response schema is derived) stays untouched.
class BoxItem(BaseModel):
    # Box coordinates as a list of ints. The prompt asks for
    # [ymin, xmin, ymax, xmax] normalized to 0-1000; neither the length nor
    # the range is validated here.
    box_2d: List[int] = Field(description="Bounding box coordinates (e.g., [ymin, xmin, ymax, xmax])")
    # Text of the exercise/question delimiter the box refers to.
    label: str = Field(description="The label associated with the specific box")
|
|
|
|
# Top-level shape of the model's JSON answer: the student name plus every
# labelled bounding box found on the image. Its JSON schema is what
# generate_request() passes as the response schema.
class AnnotationData(BaseModel):
    # Student name as read by the model (the prompt allows "Unknown").
    name: str = Field(description="The name identifier")
    # All detected delimiter boxes. The field is called `list` because the
    # prompt names it that way; it shadows the builtin inside this class body
    # only, which is why the annotation uses typing.List.
    list: List[BoxItem] = Field(description="List of bounding box items")
|
|
|
|
|
|
def generate_request(file, labels, names=""):
    """Build the Gemini request (contents and config) for one exam image.

    Args:
        file: Path of the image to send.
        labels: Labels to look for, as a single string; it is inserted in
            the prompt in place of the ``##labels##`` marker.
        names: Candidate student names, as a single string; it is inserted
            in place of the ``##names##`` marker. Defaults to an empty
            string so existing two-argument callers keep working.

    Returns:
        A ``(contents, generate_content_config)`` tuple that can be passed
        as ``contents=`` and ``config=`` to the generate-content call.

    Raises:
        OSError: If the image file cannot be read.
    """
    import mimetypes

    image_path = Path(file)

    # Use the MIME type matching the file extension when it is an image
    # type; otherwise keep the historical JPEG assumption.
    guessed_type, _ = mimetypes.guess_type(image_path.name)
    if guessed_type is not None and guessed_type.startswith("image/"):
        mime_type = guessed_type
    else:
        mime_type = "image/jpeg"

    # Put the labels where the prompt announces them. Appending them at the
    # end would place them right after the "list of the names" paragraph.
    if "##labels##" in my_prompt:
        prompt = my_prompt.replace("##labels##", labels)
    else:
        # Template without a marker: fall back to appending the labels.
        prompt = my_prompt + labels
    prompt = prompt.replace("##names##", names)

    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_bytes(
                    data=image_path.read_bytes(),
                    mime_type=mime_type,
                ),
                types.Part.from_text(text=prompt),
            ],
        )
    ]

    # NOTE: no thinking_config is set; the original author was unsure whether
    # it is compatible with a JSON response schema.
    generate_content_config = types.GenerateContentConfig(
        temperature=1.0,
        top_p=0.95,
        seed=0,
        max_output_tokens=65535,
        response_mime_type="application/json",
        response_json_schema=AnnotationData.model_json_schema(),
    )
    return (contents, generate_content_config)
|
|
|
|
import sys
|
|
import os
|
|
import time
|
|
|
|
# Both the image path and the labels are required: generate_request() needs
# the labels as its second argument.
if len(sys.argv) < 3:
    sys.exit("Usage: python script.py Staging/cutleft1000.jpg labels")

INPUT_FILE = sys.argv[1]

# The labels argument is accepted either as the path of a text file holding
# the labels or as the labels themselves. os.path.isfile() returns False
# (instead of raising) for strings that are not usable as paths.
labels_arg = sys.argv[2]
if os.path.isfile(labels_arg):
    LABELS = Path(labels_arg).read_text(encoding="utf-8")
else:
    LABELS = labels_arg

contents, config = generate_request(INPUT_FILE, LABELS)

client = genai.Client(api_key=api_key)

# Stream the answer and echo each text chunk as soon as it arrives.
for chunk in client.models.generate_content_stream(
    model=MODEL_ID,
    contents=contents,
    config=config,
):
    if chunk.text:
        print(chunk.text, end="", flush=True)
|