#!/usr/bin/env python3
"""
Volledige cleanup en deploy van de wget-scrape:
1. Verwijder WordPress ?p=... en ?replytocom=... rommel
2. Hernoem assets: verwijder ?ver=... uit bestandsnaam
3. Update HTML/CSS referenties
4. Sync naar /var/www/correctvloerverwarming2/
"""

import os
import re
import shutil
from pathlib import Path

SRC_DIR = Path("/home/anisy/projects/websites/correctvloerverwarming/correctvloerverwarming.nl")
DST_DIR = Path("/var/www/correctvloerverwarming2")

# ─── Stap 1: Verwijder WordPress preview rommel ────────────────────────────────
def delete_wp_junk(src_dir=None) -> None:
    """Delete WordPress leftovers from the scraped tree.

    Every file, symlink or directory whose *name* contains one of the junk
    markers is removed; directories are removed recursively. A one-line
    summary with the number of removed entries is printed.

    Args:
        src_dir: Root of the scraped site. Defaults to the module-level
            ``SRC_DIR`` when omitted.
    """
    root = SRC_DIR if src_dir is None else Path(src_dir)

    # One compiled alternation instead of re-searching three patterns per file.
    junk_re = re.compile(
        r"\?p=\d+"          # WordPress post previews
        r"|\?replytocom="   # comment-reply links
        r"|wp-json"         # WordPress JSON API (not needed for a static site)
    )

    deleted = 0
    # Materialise the listing first: entries are removed while we walk it.
    for path in list(root.rglob("*")):
        if not junk_re.search(path.name):
            continue
        if path.is_symlink() or path.is_file():
            # Symlinks are unlinked, never followed: shutil.rmtree() refuses
            # to operate on a symlink that points at a directory.
            path.unlink()
        elif path.is_dir():
            shutil.rmtree(path)
        else:
            # Already gone because a parent directory was removed earlier.
            continue
        deleted += 1
    print(f"  Verwijderd: {deleted} WordPress junk bestanden/folders")

# ─── Stap 2: Hernoem assets (verwijder ?ver=... / versie suffixen) ─────────────
def rename_versioned_assets(src_dir=None) -> None:
    """Strip the ``?...`` suffix from the file names of scraped assets.

    Only files whose name before the first ``?`` ends in a known asset
    extension are touched; anything else (e.g. ``index.html?p=1``) is left
    alone. If the clean name already exists, the versioned copy is treated as
    a duplicate and deleted. Prints how many files were renamed.

    Args:
        src_dir: Root of the scraped site. Defaults to the module-level
            ``SRC_DIR`` when omitted.
    """
    root = SRC_DIR if src_dir is None else Path(src_dir)
    asset_exts = {'.js', '.css', '.woff', '.woff2', '.ttf', '.eot', '.svg', '.png',
                  '.jpg', '.jpeg', '.webp', '.gif', '.ico'}

    renamed = 0
    # sorted() both snapshots the listing (we rename while walking) and makes
    # the outcome reproducible: when several versioned copies of one asset
    # exist, the lexicographically first one is kept regardless of the order
    # in which the filesystem happens to list them.
    for path in sorted(root.rglob("*")):
        if '?' not in path.name or not path.is_file():
            continue

        # The clean part in front of the first '?'.
        clean_name = path.name.split('?', 1)[0]
        if Path(clean_name).suffix.lower() not in asset_exts:
            continue  # not an asset, e.g. an HTML page with ?p=...

        target = path.parent / clean_name
        if target.exists():
            path.unlink()  # duplicate: drop the versioned copy
        else:
            path.rename(target)
            renamed += 1

    print(f"  Hernoemd: {renamed} asset bestanden")

# ─── Stap 3: Fix referenties in HTML en CSS ────────────────────────────────────
def _strip_asset_queries(root, glob_pattern, query_re, label):
    """Remove asset query strings from every file under *root* matching *glob_pattern*.

    *query_re* must capture the asset extension in group 1; each match is
    replaced by that group alone. Returns the number of files rewritten.
    Files that cannot be read or written are reported and skipped.
    """
    changed = 0
    for path in root.rglob(glob_pattern):
        try:
            # Round-trip through bytes with surrogateescape: bytes that are not
            # valid UTF-8 and the original line endings survive the rewrite
            # instead of being silently dropped or normalised.
            original = path.read_bytes().decode("utf-8", errors="surrogateescape")
            content = query_re.sub(r"\1", original)
            if content != original:
                path.write_bytes(content.encode("utf-8", errors="surrogateescape"))
                changed += 1
        except OSError as e:
            print(f"  Fout {label} {path.name}: {e}")
    return changed


def fix_references(src_dir=None) -> None:
    """Strip ``?ver=...`` style query strings from asset URLs in HTML and CSS.

    Both the literal ``?`` and the ``%3F`` escape are recognised. In HTML the
    query is followed across ``&amp;`` / ``&#038;`` / ``&name=`` separators so
    that multi-parameter queries are removed as a whole, while entity-encoded
    quotes such as ``&quot;`` still terminate the URL. Prints how many HTML
    and CSS files were rewritten.

    Args:
        src_dir: Root of the scraped site. Defaults to the module-level
            ``SRC_DIR`` when omitted.
    """
    root = SRC_DIR if src_dir is None else Path(src_dir)

    asset_ext = r"(\.(?:js|css|woff2?|ttf|eot|svg|png|jpe?g|webp|gif|ico))"
    query_start = r"(?:%3F|\?)"

    # Characters that may appear inside a query string embedded in HTML.
    html_char = r"""[^"'&\s>)]"""
    # A parameter separator: an encoded ampersand, or a bare '&' that is
    # directly followed by 'name='. '&quot;' and friends match neither.
    html_sep = r"(?:&amp;|&#0*38;|&#x0*26;|&(?=[\w.%\[\]-]+=))"
    html_re = re.compile(
        asset_ext + query_start + html_char + "+"
        + "(?:" + html_sep + html_char + "*)*",
        flags=re.IGNORECASE,
    )

    css_re = re.compile(
        asset_ext + query_start + r"""[^"')\s;]+""",
        flags=re.IGNORECASE,
    )

    html_count = _strip_asset_queries(root, "*.html", html_re, "HTML")
    css_count = _strip_asset_queries(root, "*.css", css_re, "CSS")

    print(f"  HTML bijgewerkt: {html_count} bestanden")
    print(f"  CSS bijgewerkt:  {css_count} bestanden")

# ─── Stap 4: Fix absolute URLs → relatieve paden ──────────────────────────────
def fix_absolute_urls(src_dir=None) -> None:
    """Rewrite absolute links to the scraped site into relative paths.

    In every ``*.html`` file, ``href``/``src``/``action`` attribute values
    (single- or double-quoted) that start with one of the site's own origins
    are rewritten to start with a relative prefix instead: ``./`` for files
    directly in the root, ``../`` repeated once per directory level below it.
    A link to the bare origin without a trailing slash is rewritten to the
    prefix itself. Prints how many files were changed.

    Args:
        src_dir: Root of the scraped site. Defaults to the module-level
            ``SRC_DIR`` when omitted.
    """
    root = SRC_DIR if src_dir is None else Path(src_dir)
    domains = [
        "https://www.correctvloerverwarming.nl",
        "http://www.correctvloerverwarming.nl",
        "https://correctvloerverwarming.nl",
        "http://correctvloerverwarming.nl",
    ]
    # Group 1: attribute name plus opening quote (kept). Group 2: the quote
    # character alone. After the origin we need either a '/' or the closing
    # quote, so look-alike hosts such as '...nl.example.com' are not touched.
    absolute_re = re.compile(
        r"""((?:href|src|action)=(["']))(?:"""
        + "|".join(re.escape(domain) for domain in domains)
        + r""")(?:/|(?=\2))"""
    )

    count = 0
    for html_path in root.rglob("*.html"):
        try:
            # surrogateescape + bytes I/O keeps undecodable bytes and the
            # original line endings intact across the rewrite.
            original = html_path.read_bytes().decode("utf-8", errors="surrogateescape")
            depth = len(html_path.relative_to(root).parts) - 1
            prefix = "../" * depth if depth > 0 else "./"
            content = absolute_re.sub(lambda m: m.group(1) + prefix, original)
            if content != original:
                html_path.write_bytes(content.encode("utf-8", errors="surrogateescape"))
                count += 1
        except OSError as e:
            print(f"  Fout URL fix {html_path.name}: {e}")
    print(f"  Absolute URLs gefixed: {count} HTML bestanden")

# ─── Stap 5: Sync naar webroot ─────────────────────────────────────────────────
def sync_to_webroot(src_dir=None, dst_dir=None) -> None:
    """Replace the webroot with a fresh copy of the cleaned site.

    An existing destination is removed entirely, the source tree is copied
    over, and ownership/permissions are then set with ``chown -R
    www-data:www-data`` and ``chmod -R 755``. A failing ``chown``/``chmod``
    is reported but does not abort the sync. Prints the number of files in
    the destination afterwards.

    Args:
        src_dir: Tree to publish. Defaults to the module-level ``SRC_DIR``.
        dst_dir: Webroot to (re)create. Defaults to the module-level
            ``DST_DIR``.
    """
    # Imported here so this function does not depend on the file's import block.
    import subprocess

    src = SRC_DIR if src_dir is None else Path(src_dir)
    dst = DST_DIR if dst_dir is None else Path(dst_dir)

    if dst.exists():
        shutil.rmtree(dst)
    shutil.copytree(src, dst)

    # Set ownership and permissions. Argument lists (no shell) keep paths with
    # spaces or shell metacharacters intact.
    # NOTE(review): mode 755 also marks regular files executable; confirm
    # whether that is intended for a static site.
    commands = (
        ["chown", "-R", "www-data:www-data", str(dst)],
        ["chmod", "-R", "755", str(dst)],
    )
    for command in commands:
        try:
            status = subprocess.run(command, check=False).returncode
        except OSError as e:
            print(f"  Waarschuwing: {command[0]} kon niet worden gestart: {e}")
            continue
        if status != 0:
            print(f"  Waarschuwing: {command[0]} eindigde met code {status}")

    total = sum(1 for path in dst.rglob("*") if path.is_file())
    print(f"  {total} bestanden gesynchroniseerd naar {dst}")

# ─── Hoofdprogramma ────────────────────────────────────────────────────────────
if __name__ == "__main__":
    print("\n[1/5] WordPress junk verwijderen...")
    delete_wp_junk()

    print("\n[2/5] Asset bestanden hernoemen...")
    rename_versioned_assets()

    print("\n[3/5] Asset referenties in HTML/CSS fixen...")
    fix_references()

    print("\n[4/5] Absolute URLs → relatieve paden...")
    fix_absolute_urls()

    print("\n[5/5] Sync naar webroot...")
    sync_to_webroot()

    print("\nKlaar! Site bereikbaar op https://correctvloerverwarming2.youztech.nl")