import os
import requests

# --- CONFIGURATION ---
# Directory that downloaded samples are written to. It is a relative path, so
# it resolves against the current working directory; download_samples()
# creates it when it does not exist.
OUTPUT_DIR = "manuscripts"

# Sample image URLs for Farhangan.com testing (described by the original
# author as verified live; not re-checked here).
# Each key is the local filename the image is saved under inside OUTPUT_DIR;
# each value is the URL it is fetched from.
SAMPLES = {
    "Baba_Taher_Nastaliq.png": "https://upload.wikimedia.org/wikipedia/commons/thumb/8/85/Baba_Taher_Agar_dar_Masgedi.svg/1280px-Baba_Taher_Agar_dar_Masgedi.svg.png",
    "Masnavi_Manuscript.jpg": "https://upload.wikimedia.org/wikipedia/commons/1/1b/First_18_couplets_from_a_copy_of_Masnavi-ye-Ma%27navi.jpg"
}

def download_samples():
    """Download the Persian text samples listed in SAMPLES into OUTPUT_DIR.

    Creates OUTPUT_DIR when it is missing, then fetches every URL in SAMPLES
    and stores the response body under its mapped filename. Each download is
    independent: a failure is reported on stdout and the loop moves on to the
    next sample. A summary is printed once all samples have been attempted.

    Returns:
        None. Progress and results are reported with print().

    Raises:
        OSError: If OUTPUT_DIR cannot be created (for example, a regular file
            already occupies that path).
    """
    if not os.path.isdir(OUTPUT_DIR):
        # exist_ok guards against another process creating the directory
        # between the check above and this call.
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        print(f"📁 Created directory: {OUTPUT_DIR}")

    # Browser-like User-Agent; per the original author this avoids
    # 403 Forbidden responses from the image host.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) FarhanganOCR/1.0'
    }

    downloaded_count = 0
    failed_count = 0

    print("📡 Fetching verified Persian samples...")
    for filename, url in SAMPLES.items():
        print(f"📥 Downloading: {filename}...")

        # Only network-level failures are reported as connection errors;
        # anything else (e.g. a programming error) is allowed to surface.
        try:
            response = requests.get(url, headers=headers, timeout=25)
        except requests.RequestException as e:
            print(f"⚠️ Connection error for {filename}: {e}")
            failed_count += 1
            continue

        if response.status_code != 200:
            print(f"❌ Failed: {filename} (HTTP Status: {response.status_code})")
            failed_count += 1
            continue

        filepath = os.path.join(OUTPUT_DIR, filename)
        # Write to a sibling temporary file and rename it into place, so an
        # interrupted write never leaves a truncated image at the final path
        # (which a later OCR run would otherwise pick up).
        partial_path = filepath + ".part"
        try:
            with open(partial_path, 'wb') as f:
                f.write(response.content)
            os.replace(partial_path, filepath)
        except OSError as e:
            print(f"⚠️ Could not save {filename}: {e}")
            try:
                os.remove(partial_path)
            except OSError:
                # Best-effort cleanup: the temporary file may never have been
                # created, and the save failure was already reported above.
                pass
            failed_count += 1
            continue

        print(f"✅ Saved to {filepath}")
        downloaded_count += 1

    print("\n--- Summary ---")
    if downloaded_count > 0:
        print(f"🎉 Successfully downloaded {downloaded_count} files to '{OUTPUT_DIR}'.")
        if failed_count:
            # Make partial failures visible instead of reporting only successes.
            print(f"⚠️ {failed_count} file(s) could not be downloaded; see the messages above.")
        print("💡 You can now run your OCR manually:")
        print("   bash ocr_batch.sh")
    else:
        print("❌ No files were downloaded. Please check your internet connection.")

# Run the download only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    download_samples()
