import os
import xml.etree.ElementTree as ET
from PIL import Image

# Setup paths
xml_file = 'Persian-web-page.xml'
img_file = 'upscaled_Persian-web-page.png'
output_dir = 'ground_truth'

# Default prefix map (ALTO v4); used when the document root carries no
# namespace of its own.
ns = {'alto': 'http://www.loc.gov/standards/alto/ns-v4#'}

# Geometry attributes read from every TextLine, in unpacking order.
_GEOMETRY_ATTRS = ('HPOS', 'VPOS', 'WIDTH', 'HEIGHT')


def alto_namespaces(root):
    """Return the prefix map to use when searching *root* for ALTO elements.

    The namespace is taken from the root element's own tag so that documents
    written against a different ALTO schema version are still matched.  When
    the root tag has no namespace, a copy of the default ``ns`` map is
    returned.
    """
    tag = root.tag
    if isinstance(tag, str) and tag.startswith('{'):
        # ElementTree spells qualified tags as "{uri}local".
        return {'alto': tag[1:tag.index('}')]}
    return dict(ns)


def line_box(line):
    """Return the crop box ``(left, upper, right, lower)`` for a TextLine.

    Coordinates are parsed as floats and rounded to whole numbers, because
    the attributes may hold fractional values that ``int()`` rejects.
    Returns ``None`` when an attribute is missing, is not a finite number,
    or when the rounded box has no area.

    NOTE(review): the values are used directly as pixel coordinates of the
    image; this assumes the ALTO file's measurement unit is pixels -- confirm
    for inputs produced by other tools.
    """
    try:
        left, top, width, height = (
            float(line.get(attr)) for attr in _GEOMETRY_ATTRS
        )
        box = (
            round(left),
            round(top),
            round(left + width),
            round(top + height),
        )
    except (TypeError, ValueError, OverflowError):
        # TypeError: attribute absent (None); ValueError: not a number or
        # NaN; OverflowError: infinity.
        return None

    if box[2] <= box[0] or box[3] <= box[1]:
        return None
    return box


def line_text(line, namespaces=ns):
    """Return the text of a TextLine: its String CONTENT values, space-joined.

    Only direct ``String`` children are read.  ``String`` elements without a
    ``CONTENT`` attribute are skipped instead of raising.
    """
    contents = (
        string.get('CONTENT')
        for string in line.findall('alto:String', namespaces)
    )
    return " ".join(text for text in contents if text is not None).strip()


def main():
    """Crop every TextLine out of the page image and write ground-truth pairs.

    For each usable TextLine in ``xml_file`` this writes
    ``line_NNN.png`` (the cropped region of ``img_file``) and
    ``line_NNN.gt.txt`` (the OCR text, UTF-8) into ``output_dir``.  Lines
    whose geometry cannot be used are reported and skipped, so the numbering
    of the written files stays contiguous.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # Parse ALTO XML
    root = ET.parse(xml_file).getroot()
    namespaces = alto_namespaces(root)

    count = 0
    # The context manager releases the image file handle when done.
    with Image.open(img_file) as img:
        # Find all TextLines
        for line in root.findall('.//alto:TextLine', namespaces):
            box = line_box(line)
            if box is None:
                print(
                    f"Skipping TextLine {line.get('ID')!r}: "
                    "missing or invalid geometry"
                )
                continue

            # Crop and save
            base_name = f"line_{count:03d}"
            img.crop(box).save(os.path.join(output_dir, f"{base_name}.png"))

            # Save the text for you to edit
            gt_path = os.path.join(output_dir, f"{base_name}.gt.txt")
            with open(gt_path, "w", encoding="utf-8") as f:
                f.write(line_text(line, namespaces))

            count += 1

    print(f"Extracted {count} lines to {output_dir}/")


if __name__ == "__main__":
    main()
