miscs
parent
bc47f81556
commit
dd0d757fc9
|
|
@ -12,8 +12,7 @@ import annotating_with_checks
|
||||||
|
|
||||||
from utils import natural_key
|
from utils import natural_key
|
||||||
|
|
||||||
# Roughly 10 A4 pages at 100 DPI
|
MAX_HEIGHT_PX = 25000 # Can be increased by 10%.
|
||||||
MAX_HEIGHT_PX = 20000 # Can be increased by 10%.
|
|
||||||
|
|
||||||
def render_item(item):
|
def render_item(item):
|
||||||
student_id, label, content = item
|
student_id, label, content = item
|
||||||
|
|
@ -134,15 +133,32 @@ def main():
|
||||||
shutil.rmtree(bgnot_dir)
|
shutil.rmtree(bgnot_dir)
|
||||||
os.makedirs(bgnot_dir, exist_ok=True)
|
os.makedirs(bgnot_dir, exist_ok=True)
|
||||||
|
|
||||||
|
used_prefixes = set()
|
||||||
|
|
||||||
|
previous_prefix = None
|
||||||
for line in lines:
|
for line in lines:
|
||||||
labels = [l.strip() for l in line.split(',') if l.strip()]
|
labels = [l.strip() for l in line.split(',') if l.strip()]
|
||||||
safe_labels = [l.replace(":", "").strip() for l in line.split(',') if l.strip()]
|
safe_labels = [l.replace(":", "").strip() for l in line.split(',') if l.strip()]
|
||||||
if not labels:
|
if not labels:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
prefix = os.path.commonprefix(safe_labels).strip()
|
base_prefix = os.path.commonprefix(safe_labels).strip()
|
||||||
if not prefix:
|
if not base_prefix:
|
||||||
prefix = "Group"
|
base_prefix = "Group"
|
||||||
|
|
||||||
|
unique_prefix = base_prefix
|
||||||
|
if unique_prefix[-1] == "i":
|
||||||
|
unique_prefix = unique_prefix[:-1]
|
||||||
|
counter = 2
|
||||||
|
while unique_prefix in used_prefixes:
|
||||||
|
unique_prefix = f"{base_prefix}-{counter}"
|
||||||
|
counter += 1
|
||||||
|
if counter == 2 and previous_prefix and previous_prefix in unique_prefix:
|
||||||
|
unique_prefix = f"{previous_prefix}-{counter}"
|
||||||
|
elif counter == 2:
|
||||||
|
previous_prefx = unique_prefix
|
||||||
|
|
||||||
|
used_prefixes.add(unique_prefix)
|
||||||
|
|
||||||
items_to_render = []
|
items_to_render = []
|
||||||
for sid, lbls in results.items():
|
for sid, lbls in results.items():
|
||||||
|
|
@ -201,7 +217,7 @@ def main():
|
||||||
batches = batches2
|
batches = batches2
|
||||||
|
|
||||||
for i, batch in enumerate(batches, 1):
|
for i, batch in enumerate(batches, 1):
|
||||||
save_batch(batch, prefix, i, root_dir, args.overwrite)
|
save_batch(batch, unique_prefix, i, root_dir, args.overwrite)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|
|
||||||
|
|
@ -20,7 +20,7 @@ parser.add_argument("--refaire", action="store_true",
|
||||||
help="Redo specific copies/labels defined in refaire.json")
|
help="Redo specific copies/labels defined in refaire.json")
|
||||||
parser.add_argument("--batch", action="store_true",
|
parser.add_argument("--batch", action="store_true",
|
||||||
help="Generate a JSONL file of requests to send to the Gemini Batch API")
|
help="Generate a JSONL file of requests to send to the Gemini Batch API")
|
||||||
parser.add_argument("--deal-with-batched", type=str, metavar="FILE",
|
parser.add_argument("--deal-with-batched", action="store_true",
|
||||||
help="Process a JSONL file containing completed batch results")
|
help="Process a JSONL file containing completed batch results")
|
||||||
args, _ = parser.parse_known_args()
|
args, _ = parser.parse_known_args()
|
||||||
|
|
||||||
|
|
@ -236,6 +236,44 @@ class EvaluationEntry(BaseModel):
|
||||||
id: str = Field(description="Entry identifier")
|
id: str = Field(description="Entry identifier")
|
||||||
result: ResultData = Field(description="Result details")
|
result: ResultData = Field(description="Result details")
|
||||||
|
|
||||||
|
# The nested model definitions do not work with the batch API, so the
# response schema is unrolled here into one literal with every level inline.
# NOTE(review): presumably this must stay in sync with EvaluationEntry and the
# models it nests - confirm whenever those models change.
UNROLLED_SCHEMA = {
    "type": "ARRAY",
    "items": {
        "type": "OBJECT",
        "properties": {
            "id": {"type": "STRING", "description": "Entry identifier"},
            "result": {
                "type": "OBJECT",
                "properties": {
                    "score": {"type": "NUMBER", "description": "The numeric score"},
                    "confidence": {"type": "NUMBER", "description": "Confidence level"},
                    "error": {"type": "STRING", "description": "Indicates if an error occurred"},
                    "feedback": {
                        "type": "ARRAY",
                        "description": "List of feedback items",
                        "items": {
                            "type": "OBJECT",
                            "properties": {
                                "text": {"type": "STRING", "description": "Feedback content"},
                                # Optional: not listed in "required" and explicitly nullable.
                                "box_2d": {
                                    "type": "ARRAY",
                                    "items": {"type": "INTEGER"},
                                    "nullable": True,
                                    "description": "box coordinates or null"
                                }
                            },
                            "required": ["text"]
                        }
                    }
                },
                "required": ["score", "confidence", "feedback", "error"]
            }
        },
        "required": ["id", "result"]
    }
}
|
||||||
|
|
||||||
# The root model for parsing is: List[EvaluationEntry]
|
# The root model for parsing is: List[EvaluationEntry]
|
||||||
def generate_request(file, full_label):
|
def generate_request(file, full_label):
|
||||||
"""Generates request for Gemini."""
|
"""Generates request for Gemini."""
|
||||||
|
|
@ -780,7 +818,8 @@ if __name__ == "__main__":
|
||||||
"topP": 0.95,
|
"topP": 0.95,
|
||||||
"maxOutputTokens": 65535,
|
"maxOutputTokens": 65535,
|
||||||
"responseMimeType": "application/json",
|
"responseMimeType": "application/json",
|
||||||
"responseSchema": TypeAdapter(List[EvaluationEntry]).json_schema()
|
"responseSchema": UNROLLED_SCHEMA
|
||||||
|
# TypeAdapter(List[EvaluationEntry]).json_schema()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -800,7 +839,7 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
batched_responses = {}
|
batched_responses = {}
|
||||||
if args.deal_with_batched:
|
if args.deal_with_batched:
|
||||||
batch_results_path = Path(args.deal_with_batched)
|
batch_results_path = Path(INPUT_DIR) / "batched_correction_result.jsonl"
|
||||||
if batch_results_path.exists():
|
if batch_results_path.exists():
|
||||||
print(f"Loading batch results from {batch_results_path}...")
|
print(f"Loading batch results from {batch_results_path}...")
|
||||||
with open(batch_results_path, "r", encoding="utf-8") as f:
|
with open(batch_results_path, "r", encoding="utf-8") as f:
|
||||||
|
|
|
||||||
|
|
@ -7395,6 +7395,7 @@ ennuis
|
||||||
ennuyer
|
ennuyer
|
||||||
ennuyeux
|
ennuyeux
|
||||||
énoncer
|
énoncer
|
||||||
|
énoncé
|
||||||
énonciation
|
énonciation
|
||||||
énorme
|
énorme
|
||||||
énormément
|
énormément
|
||||||
|
|
@ -21071,4 +21072,4 @@ zone
|
||||||
zones
|
zones
|
||||||
zoologie
|
zoologie
|
||||||
zozoter
|
zozoter
|
||||||
zygote
|
zygote
|
||||||
|
|
|
||||||
|
|
@ -39,23 +39,47 @@ def fast_fix(text):
|
||||||
broken_word = match.group(0)
|
broken_word = match.group(0)
|
||||||
# Return the fixed word from our map, or leave it if not found
|
# Return the fixed word from our map, or leave it if not found
|
||||||
# (Handles case-insensitivity by falling back to lowercase map)
|
# (Handles case-insensitivity by falling back to lowercase map)
|
||||||
return lookup_map.get(broken_word.lower(), broken_word)
|
fixed = lookup_map.get(broken_word.lower())
|
||||||
|
# if not fixed:
|
||||||
|
# print(f"No match found for: {repr(broken_word)}")
|
||||||
|
return fixed or broken_word
|
||||||
|
|
||||||
return re.sub(r'[a-zA-Z\x00]+', replacer, text)
|
return re.sub(r'[a-zA-Z\x00]+', replacer, text)
|
||||||
return text
|
# return text
|
||||||
|
|
||||||
|
|
||||||
INPUT_FILE = Path(INPUT_DIR) / "correction.json"
|
INPUT_FILE = Path(INPUT_DIR) / "correction.json"
|
||||||
OUTPUT_FILE = Path(INPUT_DIR) / "correction.json"
|
OUTPUT_FILE = Path(INPUT_DIR) / "correction.json"
|
||||||
|
|
||||||
|
def fix_hex_corruption_safe(text):
    """Restore characters that appear as a NUL byte followed by two hex digits.

    Each NUL character (code point 0) immediately followed by two
    hexadecimal digits, the first of which is ``e``, ``E``, ``f`` or ``F``,
    is replaced by the single character with that code point, for example
    NUL + ``e9`` becomes ``é``. Everything else, including a NUL followed by
    any other text, is returned unchanged.

    Args:
        text: The string to repair.

    Returns:
        The string with every matching sequence replaced.
    """
    # Limiting the first digit to e/f keeps the decoded code point inside
    # U+00E0-U+00FF (accented Latin-1 letters plus a few symbols), so other
    # NUL + hex-looking sequences are left alone.
    return re.sub(
        r'\x00([eEfF][0-9a-fA-F])',
        lambda m: chr(int(m.group(1), 16)),
        text,
    )
|
||||||
|
|
||||||
|
def some_other_replacements(s):
    r"""Turn a line feed followed by ``eq`` or ``ot`` back into ``\neq`` / ``\not``.

    The search strings are ordinary (non-raw) literals, so their leading
    ``\n`` is a real line-feed character, not a backslash and an ``n``.
    The replacement strings start with an actual backslash.

    Args:
        s: The string to patch.

    Returns:
        The string with both replacements applied.
    """
    # NOTE(review): presumably this repairs LaTeX commands whose "\n" was
    # decoded as a newline upstream - confirm against the data source.
    # Any line that really starts with "eq" or "ot" is rewritten as well.
    s = s.replace("\neq", "\\neq")
    s = s.replace("\not", "\\not")
    return s
|
||||||
|
|
||||||
|
|
||||||
def clean_string(s: str) -> str:
|
def clean_string(s: str) -> str:
|
||||||
# fix encoding issues
|
# fix encoding issues
|
||||||
s = ftfy.fix_text(s)
|
# s = ftfy.fix_text(s)
|
||||||
|
# print(s)
|
||||||
|
s = fix_hex_corruption_safe(s)
|
||||||
|
|
||||||
s = s.replace('\x19', '\x00')
|
s = s.replace('\x19', '\x00')
|
||||||
s = s.replace('\x18', '\x00')
|
s = s.replace('\x18', '\x00')
|
||||||
s = fast_fix(s)
|
s = s.replace('\x00\x00', '\x00')
|
||||||
s = s.replace('\x00', '')
|
|
||||||
return s
|
s = re.sub(r' \x00{1,2} ', ' à ', s)
|
||||||
|
|
||||||
|
if '\x00' in s:
|
||||||
|
s = fast_fix(s)
|
||||||
|
s = s.replace('\x00', '')
|
||||||
|
return some_other_replacements(s)
|
||||||
|
|
||||||
|
|
||||||
def clean_obj(obj):
|
def clean_obj(obj):
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue