miscs
parent
bc47f81556
commit
dd0d757fc9
|
|
@ -12,8 +12,7 @@ import annotating_with_checks
|
|||
|
||||
from utils import natural_key
|
||||
|
||||
# Roughly 10 A4 pages at 100 DPI
|
||||
MAX_HEIGHT_PX = 20000 # Can be increased by 10%.
|
||||
MAX_HEIGHT_PX = 25000 # Can be increased by 10%.
|
||||
|
||||
def render_item(item):
|
||||
student_id, label, content = item
|
||||
|
|
@ -134,15 +133,32 @@ def main():
|
|||
shutil.rmtree(bgnot_dir)
|
||||
os.makedirs(bgnot_dir, exist_ok=True)
|
||||
|
||||
used_prefixes = set()
|
||||
|
||||
previous_prefix = None
|
||||
for line in lines:
|
||||
labels = [l.strip() for l in line.split(',') if l.strip()]
|
||||
safe_labels = [l.replace(":", "").strip() for l in line.split(',') if l.strip()]
|
||||
if not labels:
|
||||
continue
|
||||
|
||||
prefix = os.path.commonprefix(safe_labels).strip()
|
||||
if not prefix:
|
||||
prefix = "Group"
|
||||
base_prefix = os.path.commonprefix(safe_labels).strip()
|
||||
if not base_prefix:
|
||||
base_prefix = "Group"
|
||||
|
||||
unique_prefix = base_prefix
|
||||
if unique_prefix[-1] == "i":
|
||||
unique_prefix = unique_prefix[:-1]
|
||||
counter = 2
|
||||
while unique_prefix in used_prefixes:
|
||||
unique_prefix = f"{base_prefix}-{counter}"
|
||||
counter += 1
|
||||
if counter == 2 and previous_prefix and previous_prefix in unique_prefix:
|
||||
unique_prefix = f"{previous_prefix}-{counter}"
|
||||
elif counter == 2:
|
||||
# Remember this prefix so the `previous_prefix in unique_prefix` check can compare later groups against it.
previous_prefix = unique_prefix
|
||||
|
||||
used_prefixes.add(unique_prefix)
|
||||
|
||||
items_to_render = []
|
||||
for sid, lbls in results.items():
|
||||
|
|
@ -201,7 +217,7 @@ def main():
|
|||
batches = batches2
|
||||
|
||||
for i, batch in enumerate(batches, 1):
|
||||
save_batch(batch, prefix, i, root_dir, args.overwrite)
|
||||
save_batch(batch, unique_prefix, i, root_dir, args.overwrite)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ parser.add_argument("--refaire", action="store_true",
|
|||
help="Redo specific copies/labels defined in refaire.json")
|
||||
parser.add_argument("--batch", action="store_true",
|
||||
help="Generate a JSONL file of requests to send to the Gemini Batch API")
|
||||
parser.add_argument("--deal-with-batched", type=str, metavar="FILE",
|
||||
parser.add_argument("--deal-with-batched", action="store_true",
|
||||
help="Process a JSONL file containing completed batch results")
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
|
|
@ -236,6 +236,44 @@ class EvaluationEntry(BaseModel):
|
|||
id: str = Field(description="Entry identifier")
|
||||
result: ResultData = Field(description="Result details")
|
||||
|
||||
# These nested definitions do not work with the batch api, unroll them
|
||||
# Hand-written response schema: an array of {id, result} entries, where each
# result carries score, confidence, error and a list of feedback items.
UNROLLED_SCHEMA = {
    "type": "ARRAY",
    "items": {
        "type": "OBJECT",
        "properties": {
            "id": {"type": "STRING", "description": "Entry identifier"},
            "result": {
                "type": "OBJECT",
                "properties": {
                    "score": {"type": "NUMBER", "description": "The numeric score"},
                    "confidence": {"type": "NUMBER", "description": "Confidence level"},
                    "error": {"type": "STRING", "description": "Indicates if an error occurred"},
                    "feedback": {
                        "type": "ARRAY",
                        "description": "List of feedback items",
                        "items": {
                            "type": "OBJECT",
                            "properties": {
                                "text": {"type": "STRING", "description": "Feedback content"},
                                "box_2d": {
                                    "type": "ARRAY",
                                    "items": {"type": "INTEGER"},
                                    "nullable": True,
                                    "description": "box coordinates or null"
                                }
                            },
                            "required": ["text"]
                        }
                    }
                },
                "required": ["score", "confidence", "feedback", "error"]
            }
        },
        "required": ["id", "result"]
    }
}
|
||||
|
||||
# The root model for parsing is: List[EvaluationEntry]
|
||||
def generate_request(file, full_label):
|
||||
"""Generates request for Gemini."""
|
||||
|
|
@ -780,7 +818,8 @@ if __name__ == "__main__":
|
|||
"topP": 0.95,
|
||||
"maxOutputTokens": 65535,
|
||||
"responseMimeType": "application/json",
|
||||
"responseSchema": TypeAdapter(List[EvaluationEntry]).json_schema()
|
||||
"responseSchema": UNROLLED_SCHEMA
|
||||
# TypeAdapter(List[EvaluationEntry]).json_schema()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -800,7 +839,7 @@ if __name__ == "__main__":
|
|||
|
||||
batched_responses = {}
|
||||
if args.deal_with_batched:
|
||||
batch_results_path = Path(args.deal_with_batched)
|
||||
batch_results_path = Path(INPUT_DIR) / "batched_correction_result.jsonl"
|
||||
if batch_results_path.exists():
|
||||
print(f"Loading batch results from {batch_results_path}...")
|
||||
with open(batch_results_path, "r", encoding="utf-8") as f:
|
||||
|
|
|
|||
|
|
@ -7395,6 +7395,7 @@ ennuis
|
|||
ennuyer
|
||||
ennuyeux
|
||||
énoncer
|
||||
énoncé
|
||||
énonciation
|
||||
énorme
|
||||
énormément
|
||||
|
|
|
|||
|
|
@ -39,23 +39,47 @@ def fast_fix(text):
|
|||
broken_word = match.group(0)
|
||||
# Return the fixed word from our map, or leave it if not found
|
||||
# (Handles case-insensitivity by falling back to lowercase map)
|
||||
return lookup_map.get(broken_word.lower(), broken_word)
|
||||
fixed = lookup_map.get(broken_word.lower())
|
||||
# if not fixed:
|
||||
# print(f"No match found for: {repr(broken_word)}")
|
||||
return fixed or broken_word
|
||||
|
||||
return re.sub(r'[a-zA-Z\x00]+', replacer, text)
|
||||
return text
|
||||
# return text
|
||||
|
||||
|
||||
INPUT_FILE = Path(INPUT_DIR) / "correction.json"
|
||||
OUTPUT_FILE = Path(INPUT_DIR) / "correction.json"
|
||||
|
||||
# A NUL character followed by a two-digit hex code whose first digit is e or f
# (case-insensitive), i.e. a code in the range E0-FF.
_HEX_CORRUPTION_RE = re.compile(r'\x00([eEfF][0-9a-fA-F])')


def fix_hex_corruption_safe(text: str) -> str:
    """Replace ``NUL + two hex digits`` sequences with the character they encode.

    Only codes in the range E0-FF are rewritten, which maps to the code points
    U+00E0-U+00FF (mostly accented Latin-1 letters). Every other character,
    including a NUL followed by any other hex pair, is left untouched.

    Args:
        text: The string to repair.

    Returns:
        The string with each matching sequence replaced by a single character.
    """
    return _HEX_CORRUPTION_RE.sub(lambda m: chr(int(m.group(1), 16)), text)
|
||||
|
||||
def some_other_replacements(s: str) -> str:
    r"""Restore LaTeX commands whose leading ``\n`` was decoded as a newline.

    A newline followed by ``eq``, ``ot``, ``otin`` or ``otag`` is turned back
    into a literal backslash-n sequence (``\neq``, ``\not``, ``\notin``,
    ``\notag``). The match is rejected when another ASCII letter follows, so
    an ordinary line that merely starts with those letters (for example
    "other") keeps its newline.

    Args:
        s: The string to repair.

    Returns:
        The string with the broken LaTeX commands restored.
    """
    # Longer alternatives come first so "otin"/"otag" are not cut short at "ot".
    return re.sub(r'\n(eq|otin|otag|ot)(?![A-Za-z])', r'\\n\1', s)
|
||||
|
||||
|
||||
def clean_string(s: str) -> str:
    """Repair encoding damage in *s* and return the cleaned string.

    Steps, in order: decode NUL-prefixed hex codes, normalise the \\x18/\\x19
    control characters to NUL placeholders, rewrite an isolated placeholder
    between spaces as "à", run the dictionary-based ``fast_fix`` on anything
    still containing a placeholder, strip leftover placeholders, and finally
    restore broken LaTeX commands.
    """
    # NOTE: ftfy.fix_text(s) is disabled in this revision.
    s = fix_hex_corruption_safe(s)

    # Treat \x19 and \x18 as NUL placeholders, then collapse NUL pairs.
    s = s.replace('\x19', '\x00')
    s = s.replace('\x18', '\x00')
    s = s.replace('\x00\x00', '\x00')

    # One or two placeholders standing alone between spaces become "à".
    s = re.sub(r' \x00{1,2} ', ' à ', s)

    if '\x00' in s:
        s = fast_fix(s)
        # Remove any placeholders that are still present after fast_fix.
        s = s.replace('\x00', '')
    return some_other_replacements(s)
|
||||
|
||||
|
||||
def clean_obj(obj):
|
||||
|
|
|
|||
Loading…
Reference in New Issue