249 lines
7.6 KiB
Python
249 lines
7.6 KiB
Python
import os
|
|
import json
|
|
import re
|
|
import sys
|
|
import shutil
|
|
from collections import defaultdict
|
|
from PIL import Image, ImageDraw, ImageFont
|
|
from pdf2image import convert_from_path, pdfinfo_from_path
|
|
|
|
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Rasterisation resolution: a good balance between readability and file size.
DPI = 200

# Height of one A4 page, in inches and in pixels at the resolution above.
A4_HEIGHT_INCHES = 11.69
FULL_PAGE_PX = int(A4_HEIGHT_INCHES * DPI)

# Grouping limits: a group is closed once it would exceed either of these.
MAX_GROUP_HEIGHT = 2.5 * FULL_PAGE_PX  # pixels (two and a half A4 pages)
MAX_GROUP_COUNT = 15  # files per group

# Layout of the composed image, in pixels.
SEPARATOR_HEIGHT = 20  # black band drawn between consecutive images
LABEL_HEIGHT = 50  # strip reserved above each image for its text label

# Upper bound targeted when compressing the output JPEG (2.5 MiB).
MAX_FILE_SIZE_BYTES = 2.5 * 1024 * 1024
|
|
|
|
|
|
# Previous single-page implementation, kept for reference:
#
# def get_pdf_height(path):
#     """Returns height in pixels at defined DPI without rendering."""
#     try:
#         info = pdfinfo_from_path(path)
#         # info["Page size"] is usually "width x height pts"
#         # 1 pt = 1/72 inch
#         # We assume single-page PDFs, or take the first page
#         pts_height = float(info['Page size'].split(' ')[2]) if 'Page size' in info else 0
#         return int((pts_height / 72.0) * DPI)
#     except Exception as e:
#         print(f"Error reading {path}: {e}")
#         return 0
|
|
|
|
def get_pdf_height(path):
    """Estimate the total height of a PDF in pixels at ``DPI``, without rendering.

    The metadata returned by ``pdfinfo_from_path`` supplies the page count
    ("Pages", defaulting to 1 when absent) and the "Page size" string, whose
    third space-separated token is read as the page height in points
    (1 pt = 1/72 inch). The single-page height is truncated to whole pixels
    and multiplied by the page count.

    NOTE(review): this presumes every page has the size reported in
    "Page size" -- confirm for mixed-size documents.

    Args:
        path: Path of the PDF file to inspect.

    Returns:
        The estimated height in pixels, or 0 when "Page size" is missing or
        any error occurs (the error is printed, not raised).
    """
    try:
        pdf_info = pdfinfo_from_path(path)

        if "Pages" in pdf_info:
            page_count = int(pdf_info["Pages"])
        else:
            page_count = 1

        if 'Page size' in pdf_info:
            height_pts = float(pdf_info['Page size'].split(' ')[2])
        else:
            height_pts = 0

        # Points -> inches -> pixels, truncated per page before multiplying.
        page_px = int((height_pts / 72.0) * DPI)
        return page_px * page_count
    except Exception as err:
        print(f"Error reading {path}: {err}")
        return 0
|
|
|
|
def collect_files(root_dir):
    """Index every PDF found in a ``Copie<dd>`` folder below ``root_dir``.

    The tree is walked recursively. A directory qualifies when its base name
    starts with ``Copie`` followed by two digits (the pattern is anchored at
    the start only, so longer names with that prefix qualify too). Inside a
    qualifying directory every file whose name ends in ``.pdf``
    (case-insensitive) is recorded under its name without extension.

    Args:
        root_dir: Directory to scan.

    Returns:
        A ``defaultdict(list)`` mapping ``identifier`` to a list of
        ``(dd, path, height)`` tuples, where ``dd`` is the two-digit string
        from the folder name, ``path`` the full path of the PDF and
        ``height`` the value returned by ``get_pdf_height(path)``.
    """
    copy_dir_re = re.compile(r'Copie(\d{2})')
    by_identifier = defaultdict(list)

    for current_dir, _subdirs, filenames in os.walk(root_dir):
        hit = copy_dir_re.match(os.path.basename(current_dir))
        if not hit:
            continue

        dd = hit.group(1)
        for name in filenames:
            if not name.lower().endswith('.pdf'):
                continue

            stem = os.path.splitext(name)[0]
            pdf_path = os.path.join(current_dir, name)
            by_identifier[stem].append((dd, pdf_path, get_pdf_height(pdf_path)))

    return by_identifier
|
|
|
|
def group_files(file_list, max_height=None, max_count=None,
                separator_height=None, label_height=None):
    """Split the files of one identifier into groups that fit the size limits.

    Files are sorted by their ``dd`` key and packed greedily, in order, into
    consecutive groups. The height accounted for each file mirrors the layout
    produced when a group is rendered: every image is preceded by a label
    strip, and every image except the first of a group is also preceded by a
    separator band. A group is closed before a file would push it past
    ``max_height`` or ``max_count``.

    A file that alone exceeds ``max_height`` is still emitted, as a group of
    its own, so no input is ever dropped.

    Args:
        file_list: Iterable of ``(dd, path, height)`` tuples.
        max_height: Maximum rendered height of a group in pixels.
            Defaults to ``MAX_GROUP_HEIGHT``.
        max_count: Maximum number of files per group.
            Defaults to ``MAX_GROUP_COUNT``.
        separator_height: Height of the band between two images.
            Defaults to ``SEPARATOR_HEIGHT``.
        label_height: Height of the label strip above each image.
            Defaults to ``LABEL_HEIGHT``.

    Returns:
        A list of groups, each a non-empty list of the original tuples.
    """
    # Resolve defaults lazily so the module constants are only read when the
    # caller did not supply an explicit value.
    if max_height is None:
        max_height = MAX_GROUP_HEIGHT
    if max_count is None:
        max_count = MAX_GROUP_COUNT
    if separator_height is None:
        separator_height = SEPARATOR_HEIGHT
    if label_height is None:
        label_height = LABEL_HEIGHT

    groups = []
    current_group = []
    current_height = 0

    for item in sorted(file_list, key=lambda entry: entry[0]):
        height = item[2]

        # Every image carries its label; only non-first images add a separator.
        cost = height + label_height
        if current_group:
            cost += separator_height

        group_is_full = (
            len(current_group) >= max_count
            or current_height + cost > max_height
        )
        # Only close a non-empty group: an oversized file must still be placed.
        if current_group and group_is_full:
            groups.append(current_group)
            current_group = []
            current_height = 0
            cost = height + label_height  # first file of the new group

        current_group.append(item)
        current_height += cost

    if current_group:
        groups.append(current_group)

    return groups
|
|
|
|
def stitch_pdf_pages(images_list):
    """Stack page images top to bottom into one image, with no gap between them.

    Args:
        images_list: Sequence of PIL images, one per page, in page order.

    Returns:
        ``None`` for an empty (or falsy) input; the sole image itself when
        there is exactly one; otherwise a new white RGB image as wide as the
        widest page and as tall as all pages together, with each page pasted
        at the left edge.
    """
    if not images_list:
        return None
    if len(images_list) == 1:
        return images_list[0]

    widths = [page.width for page in images_list]
    heights = [page.height for page in images_list]
    sheet = Image.new('RGB', (max(widths), sum(heights)), 'white')

    top = 0
    for page, page_height in zip(images_list, heights):
        sheet.paste(page, (0, top))
        top += page_height

    return sheet
|
|
|
|
def create_jpg(identifier, group_index, group, root_dir):
    """Render one group of PDFs into a labelled JPEG plus a JSON sidecar.

    Each PDF of the group is rasterised at ``DPI`` (multi-page PDFs are
    stitched into one tall image) and stacked vertically on a white canvas.
    Every image is preceded by a ``LABEL_HEIGHT`` strip containing the text
    ``ID: <dd>``, and consecutive entries are divided by a black band of
    ``SEPARATOR_HEIGHT`` pixels. PDFs that fail to convert are reported and
    skipped; if none converts, nothing is written.

    Outputs, written to ``<root_dir>/<identifier>/``:
        * ``Group_<n>.json`` -- list of ``[dd, h_min, h_max]`` giving the
          vertical pixel range of each image on the canvas.
        * ``Group_<n>.jpg`` -- the canvas, re-encoded at decreasing quality
          (90 down to 15, in steps of 5) until it fits
          ``MAX_FILE_SIZE_BYTES``.

    ``<n>`` is ``group_index + 1``.

    Args:
        identifier: Name of the output sub-folder.
        group_index: Zero-based index of the group.
        group: Iterable of ``(dd, path, height)`` tuples; ``height`` is unused.
        root_dir: Directory under which the output folder is created.

    Returns:
        None.
    """
    rendered = []  # (dd, image) for every PDF that converted successfully
    metadata = []  # (dd, h_min, h_max) per image, in canvas coordinates

    for dd, path, _ in group:
        try:
            pages = convert_from_path(path, dpi=DPI)
            if pages:
                # Collapse a multi-page PDF into one single image object.
                merged = stitch_pdf_pages(pages)
                if merged is not None:
                    rendered.append((dd, merged))
        except Exception as e:
            # Best effort: a broken PDF must not abort the whole group.
            print(f"Failed to convert {path}: {e}")

    if not rendered:
        return

    # Canvas size: all images, separators between them, one label per image.
    total_width = max(img.width for _, img in rendered)
    total_height = (
        sum(img.height for _, img in rendered)
        + (len(rendered) - 1) * SEPARATOR_HEIGHT
        + len(rendered) * LABEL_HEIGHT
    )

    canvas = Image.new('RGB', (total_width, total_height), 'white')
    draw = ImageDraw.Draw(canvas)

    # Try loading a scalable font, fall back to PIL's built-in default.
    try:
        font = ImageFont.truetype("DejaVuSans.ttf", 40)
    except IOError:
        print("font not found")
        font = ImageFont.load_default()

    y_offset = 0
    for index, (dd, img) in enumerate(rendered):
        if index > 0:
            if SEPARATOR_HEIGHT > 0:
                # rectangle() includes both endpoints, hence the "- 1": the
                # band must cover exactly SEPARATOR_HEIGHT rows and must not
                # bleed into the label strip below it.
                draw.rectangle(
                    [0, y_offset, total_width - 1, y_offset + SEPARATOR_HEIGHT - 1],
                    fill='black',
                )
            y_offset += SEPARATOR_HEIGHT

        draw.text((10, y_offset + 5), f"ID: {dd}", fill='black', font=font)
        y_offset += LABEL_HEIGHT

        # Record where the image itself sits on the canvas.
        metadata.append((dd, y_offset, y_offset + img.height))

        canvas.paste(img, (0, y_offset))
        y_offset += img.height

    target_folder = os.path.join(root_dir, identifier)
    os.makedirs(target_folder, exist_ok=True)

    # Save JSON metadata.
    json_path = os.path.join(target_folder, f"Group_{group_index+1}.json")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f)

    # Save the JPEG, lowering the quality until the size constraint is met.
    output_path = os.path.join(target_folder, f"Group_{group_index+1}.jpg")
    for quality in range(90, 10, -5):
        canvas.save(output_path, "JPEG", quality=quality, optimize=True)
        if os.path.getsize(output_path) <= MAX_FILE_SIZE_BYTES:
            if quality < 90:
                print("quality : ", quality)
            break
    else:
        # Every quality level was tried; keep the smallest file but say so.
        print(f"Warning: {output_path} still exceeds the size limit at the lowest quality tried")

    print(f"Saved {output_path} ({os.path.getsize(output_path)/1024/1024:.2f} MB)")
|
|
|
|
def main():
    """Command-line entry point: ``python app.py <Path_to_Dir>``.

    Scans the given directory for PDFs, then for every identifier found:
    removes any existing ``<dir>/<identifier>`` folder, recreates it empty,
    splits the identifier's files into groups and renders each group.

    Exits with status 1 after printing a usage line when no directory
    argument is supplied.
    """
    argv = sys.argv
    if len(argv) < 2:
        print("Usage: python app.py <Path_to_Dir>")
        sys.exit(1)

    root_dir = argv[1]

    print("Scanning files...")
    data = collect_files(root_dir)
    print(f"Found {len(data)} identifiers. Processing...")

    for identifier, entries in data.items():
        # Start from a clean output folder: stale results are wiped first.
        out_dir = os.path.join(root_dir, identifier)
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        os.makedirs(out_dir, exist_ok=True)

        # entries is a list of (dd, path, height) tuples.
        for group_number, members in enumerate(group_files(entries)):
            create_jpg(identifier, group_number, members, root_dir)

    print("Done.")


if __name__ == "__main__":
    main()
|