# Copies/grouping.py
#
# 290 lines
# 9.1 KiB
# Python

import os
import json
import re
import sys
import shutil
from collections import defaultdict
from PIL import Image, ImageDraw, ImageFont
from pdf2image import convert_from_path, pdfinfo_from_path
# Configuration
# Resolution, in dots per inch, used both to estimate PDF heights and to render PDFs.
DPI = 200 # Good balance for readability and size
# Height of an A4 page in inches.
A4_HEIGHT_INCHES = 11.69
# Height of one A4 page in pixels at DPI (truncated to an int).
FULL_PAGE_PX = int(A4_HEIGHT_INCHES * DPI)
# Height budget of one group in pixels: two A4 pages (a float because of the `2.` factor).
MAX_GROUP_HEIGHT = 2. * FULL_PAGE_PX
# Maximum number of PDFs placed in a single group.
MAX_GROUP_COUNT = 15
# Height in pixels of the black bar drawn between consecutive images of a group.
SEPARATOR_HEIGHT = 20
# Vertical space in pixels reserved above each image for its "ID: dd" label.
LABEL_HEIGHT = 50
# Size limit that the JPEG quality loop in create_jpg tries to stay under.
MAX_FILE_SIZE_BYTES = 2.5 * 1024 * 1024 # 2.5 MiB
# def get_pdf_height(path):
# """Returns height in pixels at defined DPI without rendering."""
# try:
# info = pdfinfo_from_path(path)
# # info["Page size"] is usually "width height pts"
# # 1 pt = 1/72 inch
# # We assume single page PDFs as per prompt implication, or take the first page
# pts_height = float(info['Page size'].split(' ')[2]) if 'Page size' in info else 0
# return int((pts_height / 72.0) * DPI)
# except Exception as e:
# print(f"Error reading {path}: {e}")
# return 0
def get_pdf_height(path):
    """Estimate the total rendered height of a PDF in pixels, without rendering it.

    The page count and page size are read with ``pdfinfo_from_path``. The
    height (in points, 72 per inch) of the reported page size is converted to
    pixels at ``DPI`` and multiplied by the number of pages, so every page is
    treated as having the reported size.

    Returns 0, after printing the error, when the PDF metadata cannot be read
    or parsed. A missing "Page size" entry also yields 0.
    """
    try:
        pdf_info = pdfinfo_from_path(path)

        # Fall back to a single page when the page count is not reported.
        page_count = int(pdf_info["Pages"]) if "Pages" in pdf_info else 1

        if "Page size" in pdf_info:
            # The third space-separated token of "Page size" is taken as the
            # height in points.
            height_pts = float(pdf_info["Page size"].split(" ")[2])
        else:
            height_pts = 0

        page_px = int((height_pts / 72.0) * DPI)
        return page_px * page_count
    except Exception as e:
        print(f"Error reading {path}: {e}")
        return 0
def collect_files(root_dir):
    """
    Walk root_dir and index every PDF that sits directly in a 'Copie<dd>' folder.

    A folder qualifies when its name starts with 'Copie' followed by two
    digits; those two digits become ``dd``. The identifier of a PDF is its
    file name without the extension.

    Returns dict: {identifier: [(dd, path, height), ...]} where height is the
    value returned by get_pdf_height for that path.
    """
    folder_pattern = re.compile(r'Copie(\d{2})')
    data = defaultdict(list)

    for current_dir, _subdirs, filenames in os.walk(root_dir):
        folder_match = folder_pattern.match(os.path.basename(current_dir))
        if not folder_match:
            continue
        dd = folder_match.group(1)

        for filename in filenames:
            if not filename.lower().endswith('.pdf'):
                continue
            identifier = os.path.splitext(filename)[0]
            full_path = os.path.join(current_dir, filename)
            height = get_pdf_height(full_path)
            data[identifier].append((dd, full_path, height))

    return data
def group_files(file_list, *, max_height=None, max_count=None,
                separator_height=None, label_height=None):
    """
    Pack files into groups with the First Fit Decreasing heuristic.

    Files are sorted by height, tallest first, and each one is placed into the
    first existing group that still has room; a new group is opened when none
    does. A file taller than the budget therefore ends up alone in its group.

    The height charged for a file mirrors the canvas layout built by
    create_jpg: every image is preceded by a label band, and every image
    except the first of its group is also preceded by a separator bar.

    Args:
        file_list: iterable of (dd, path, height) tuples.
        max_height: height budget of a group in pixels
            (default: MAX_GROUP_HEIGHT).
        max_count: maximum number of files per group
            (default: MAX_GROUP_COUNT).
        separator_height: height of the bar between two images
            (default: SEPARATOR_HEIGHT).
        label_height: height of the label band above each image
            (default: LABEL_HEIGHT).

    Returns:
        A list of groups, each a list of the original (dd, path, height)
        tuples. Within a group the items appear in placement order
        (tallest first), not in input order.
    """
    # Resolve defaults at call time so the module constants stay the single
    # source of truth for callers that pass nothing.
    if max_height is None:
        max_height = MAX_GROUP_HEIGHT
    if max_count is None:
        max_count = MAX_GROUP_COUNT
    if separator_height is None:
        separator_height = SEPARATOR_HEIGHT
    if label_height is None:
        label_height = LABEL_HEIGHT

    # Large items are the hardest to fit, so handle them first.
    ordered = sorted(file_list, key=lambda entry: entry[2], reverse=True)

    # Each group tracks its items and the canvas height they will occupy.
    groups = []
    for item in ordered:
        # Every image carries its own label band, including the first one.
        item_height = item[2] + label_height

        for group in groups:
            if len(group['items']) >= max_count:
                continue
            # Joining a non-empty group also costs one separator bar.
            needed = separator_height + item_height
            if group['current_height'] + needed <= max_height:
                group['items'].append(item)
                group['current_height'] += needed
                break
        else:
            # No existing group had room: open a new one.
            groups.append({'items': [item], 'current_height': item_height})

    # Strip the bookkeeping and return plain lists.
    return [group['items'] for group in groups]
# def group_files(file_list):
# """Groups files based on constraints."""
# sorted_files = sorted(file_list, key=lambda x: x[0])
# groups = []
# current_group = []
# current_height = 0
# for item in sorted_files:
# dd, path, height = item
# # Calculate added height (image + separator + approx text space)
# # We add separator height only if it's not the first image
# added_overhead = SEPARATOR_HEIGHT + 30 if current_group else 0
# # Check conditions
# if (len(current_group) >= MAX_GROUP_COUNT or
# (current_height + height + added_overhead) > MAX_GROUP_HEIGHT):
# # Push current group and start new
# if current_group:
# groups.append(current_group)
# current_group = []
# current_height = 0
# added_overhead = 0 # Reset for first file of new group
# current_group.append(item)
# current_height += height + added_overhead
# if current_group:
# groups.append(current_group)
# return groups
def stitch_pdf_pages(images_list):
    """Stack PIL images top-to-bottom into one image, with no gap between them.

    Returns None for an empty input and the sole image itself (not a copy)
    when there is only one. Otherwise returns a new white RGB image as wide
    as the widest input and as tall as all inputs together; each input is
    pasted against the left edge.
    """
    if not images_list:
        return None
    if len(images_list) == 1:
        return images_list[0]

    widths = [page.width for page in images_list]
    heights = [page.height for page in images_list]
    stitched = Image.new('RGB', (max(widths), sum(heights)), 'white')

    top = 0
    for page, page_height in zip(images_list, heights):
        stitched.paste(page, (0, top))
        top += page_height
    return stitched
def _render_group_images(group):
    """Render each PDF of *group* and return a list of (dd, image) pairs.

    A multi-page PDF is collapsed into one tall image with stitch_pdf_pages.
    A PDF that fails to convert is reported and skipped.
    """
    rendered = []
    for dd, path, _ in group:
        try:
            pages = convert_from_path(path, dpi=DPI)
            if pages:
                combined = stitch_pdf_pages(pages)
                if combined is not None:
                    rendered.append((dd, combined))
        except Exception as e:
            print(f"Failed to convert {path}: {e}")
    return rendered


def _save_jpeg_within_limit(canvas, output_path):
    """Save *canvas* as a JPEG, lowering quality until it fits MAX_FILE_SIZE_BYTES.

    Qualities 90, 85, ..., 15 are tried in turn; the first file small enough
    is kept. If none fits, the last (lowest-quality) file is kept and a
    warning is printed instead of failing silently.
    """
    for quality in range(90, 10, -5):
        canvas.save(output_path, "JPEG", quality=quality, optimize=True)
        if os.path.getsize(output_path) <= MAX_FILE_SIZE_BYTES:
            if quality < 90:
                print("quality : ", quality)
            break
    else:
        print(f"Warning: {output_path} still exceeds the size limit "
              f"at the lowest quality tried")
    print(f"Saved {output_path} ({os.path.getsize(output_path)/1024/1024:.2f} MB)")


def create_jpg(identifier, group_index, group, root_dir):
    """Render one group of PDFs into a single labelled JPEG plus a JSON index.

    The PDFs are stacked vertically on a white canvas. Each image is preceded
    by a label band of LABEL_HEIGHT pixels showing "ID: <dd>", and images
    after the first are also preceded by a black bar of SEPARATOR_HEIGHT
    pixels.

    Two files are written to <root_dir>/<identifier>/:
      * Group_<n>.json - a list of [dd, h_min, h_max] entries giving the
        vertical pixel range of each image on the canvas;
      * Group_<n>.jpg  - the canvas itself,
    where n is group_index + 1.

    Args:
        identifier: name of the output sub-folder.
        group_index: zero-based index of the group.
        group: list of (dd, path, height) tuples; the height is not used here.
        root_dir: directory under which the output folder is created.

    Returns:
        None. Nothing is written when no PDF of the group could be rendered.
    """
    images = _render_group_images(group)
    if not images:
        return

    # Canvas size: all images, one separator between neighbours and one label
    # band per image.
    total_width = max(img.width for _, img in images)
    total_height = (
        sum(img.height for _, img in images)
        + (len(images) - 1) * SEPARATOR_HEIGHT
        + len(images) * LABEL_HEIGHT
    )
    canvas = Image.new('RGB', (total_width, total_height), 'white')
    draw = ImageDraw.Draw(canvas)

    # Try loading a font, fall back to Pillow's default one.
    try:
        font = ImageFont.truetype("DejaVuSans.ttf", 40)
    except OSError:
        print("font not found")
        font = ImageFont.load_default()

    metadata = []  # (dd, h_min, h_max) for each image
    y_offset = 0
    for i, (dd, img) in enumerate(images):
        if i > 0:
            if SEPARATOR_HEIGHT > 0:
                # Pillow rectangle bounds are inclusive, hence the "- 1":
                # without it the bar is one row too tall and spills into the
                # label band below it.
                draw.rectangle(
                    [0, y_offset, total_width - 1, y_offset + SEPARATOR_HEIGHT - 1],
                    fill='black',
                )
            y_offset += SEPARATOR_HEIGHT

        # Label band.
        draw.text((10, y_offset + 5), f"ID: {dd}", fill='black', font=font)
        y_offset += LABEL_HEIGHT

        # Record where the image lands, then paste it against the left edge.
        metadata.append((dd, y_offset, y_offset + img.height))
        canvas.paste(img, (0, y_offset))
        y_offset += img.height

    target_folder = os.path.join(root_dir, identifier)
    os.makedirs(target_folder, exist_ok=True)

    # Save JSON metadata.
    json_path = os.path.join(target_folder, f"Group_{group_index+1}.json")
    with open(json_path, 'w') as f:
        json.dump(metadata, f)

    # Save the image under the size constraint.
    output_path = os.path.join(target_folder, f"Group_{group_index+1}.jpg")
    _save_jpeg_within_limit(canvas, output_path)
def main():
    """Command-line entry point: group the PDFs found under a directory.

    Expects the root directory as the first command-line argument. For every
    identifier found by collect_files, the folder <root>/<identifier> is
    emptied and recreated, the identifier's PDFs are split into groups with
    group_files, and each group is rendered with create_jpg.

    Exits with status 1 when the argument is missing or is not a directory.
    """
    if len(sys.argv) < 2:
        print(f"Usage: python {os.path.basename(sys.argv[0])} <Path_to_Dir>")
        sys.exit(1)

    root_dir = sys.argv[1]
    if not os.path.isdir(root_dir):
        # os.walk would silently yield nothing for a bad path.
        print(f"Error: {root_dir} is not a directory")
        sys.exit(1)

    print("Scanning files...")
    data = collect_files(root_dir)
    print(f"Found {len(data)} identifiers. Processing...")

    # Folders that hold input PDFs; these must never be wiped as "output".
    source_dirs = {
        os.path.abspath(os.path.dirname(path))
        for infos in data.values()
        for _, path, _ in infos
    }

    for identifier, files_info in data.items():
        target_folder = os.path.join(root_dir, identifier)
        target_abs = os.path.abspath(target_folder)

        # Refuse to delete an output folder that is, or contains, a source
        # folder (e.g. a PDF whose name matches a 'Copie<dd>' folder).
        if any(src == target_abs or src.startswith(target_abs + os.sep)
               for src in source_dirs):
            print(f"Skipping {identifier}: output folder {target_folder} "
                  f"contains input PDFs")
            continue

        # Clear output directory if it exists.
        if os.path.exists(target_folder):
            shutil.rmtree(target_folder)
        os.makedirs(target_folder, exist_ok=True)

        # files_info is a list of (dd, path, height).
        file_groups = group_files(files_info)
        for idx, group in enumerate(file_groups):
            create_jpg(identifier, idx, group, root_dir)

    print("Done.")


if __name__ == "__main__":
    main()