import os
import re

# Arabic code points mapped to the Persian letters that should replace them.
# Applied in a single pass with str.translate.
_ARABIC_TO_PERSIAN = str.maketrans({
    '\u0643': '\u06a9',  # Arabic Kaf -> Persian Kaf
    '\u0649': '\u06cc',  # Arabic Alif Maksura -> Persian Yeh
    '\u064a': '\u06cc',  # Arabic Yeh -> Persian Yeh
})

# Zero-width non-joiner ("nim-fasele").
_ZWNJ = '\u200c'

# A detached suffix ("hay", "ha", "tarin" or "tar") preceded by one space.
# The lookahead demands a word boundary after the suffix -- whitespace, ASCII
# or Persian punctuation, or the end of the text -- WITHOUT consuming it, so
# a suffix at the very end of the text is handled and two suffixes in a row
# can both be joined in the same pass.
_DETACHED_SUFFIX_RE = re.compile(
    r' (\u0647\u0627\u06cc|\u0647\u0627|\u062a\u0631\u06cc\u0646|\u062a\u0631)'
    r'(?=[\s.,:;!?)\]}\u060c\u061b\u061f\u00bb]|\Z)'
)

# The verb prefix "mi" standing alone at the start of the text or after
# whitespace, followed by one space.
_DETACHED_MI_RE = re.compile(r'(^|\s)\u0645\u06cc ')

_SPACE_RUN_RE = re.compile(r' +')


def normalize_persian(text: str) -> str:
    """Return *text* with Persian letters and half-spaces standardized.

    Steps, in order:

    1. Replace Arabic Kaf, Alif Maksura and Yeh with Persian Kaf / Yeh.
    2. Collapse every run of spaces into a single space.
    3. Replace the space before a detached suffix (U+0647 U+0627 "ha",
       "ha" + U+06CC, U+062A U+0631 "tar", "tar" + U+06CC U+0646) with a
       ZWNJ, provided the suffix is followed by whitespace, punctuation
       or the end of the text.
    4. Replace the space after a detached prefix U+0645 U+06CC ("mi") with
       a ZWNJ when the prefix starts the text or follows whitespace.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    text = text.translate(_ARABIC_TO_PERSIAN)

    # Collapse spaces BEFORE inserting ZWNJs: the rules below match exactly
    # one space, so a double space would otherwise leave a stray space next
    # to the inserted ZWNJ. The later steps never add spaces, so no further
    # clean-up is needed afterwards.
    text = _SPACE_RUN_RE.sub(' ', text)

    # Join detached suffixes to the preceding word.
    text = _DETACHED_SUFFIX_RE.sub(_ZWNJ + r'\1', text)

    # Join the detached "mi" prefix to the following word; group 1 keeps
    # whatever whitespace (if any) preceded the prefix.
    text = _DETACHED_MI_RE.sub(r'\1' + '\u0645\u06cc' + _ZWNJ, text)

    return text

def process_files(directory='.'):
    """Normalize every ``.txt`` file in *directory* into a ``norm_`` copy.

    Each regular file whose name ends in ``.txt`` and does not already start
    with ``norm_`` is read as UTF-8, passed through ``normalize_persian`` and
    written as ``norm_<name>`` in the same directory, overwriting any
    existing file of that name. One line is printed per file reporting
    success or the error encountered; a failure on one file does not stop
    the remaining files from being processed.

    Args:
        directory: Directory to scan. Defaults to the current working
            directory.

    Raises:
        OSError: If *directory* itself cannot be listed.
    """
    # Sorted so the printed report is in a deterministic order.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.txt') or filename.startswith('norm_'):
            continue

        source_path = os.path.join(directory, filename)
        # A directory that happens to be named "*.txt" is not an input file.
        if not os.path.isfile(source_path):
            continue

        output_name = f"norm_{filename}"
        try:
            with open(source_path, 'r', encoding='utf-8') as f:
                content = f.read()

            clean_content = normalize_persian(content)

            with open(os.path.join(directory, output_name), 'w', encoding='utf-8') as f:
                f.write(clean_content)
        except (OSError, UnicodeError) as e:
            # Only I/O and decoding/encoding failures are expected here;
            # anything else is a programming error and should surface.
            print(f"Error processing {filename}: {e}")
        else:
            print(f"Successfully normalized: {filename} -> {output_name}")

# Run the batch normalization only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    process_files()
