#!/usr/bin/env python3
"""
Full cleanup and deploy of the wget scrape:
1. Delete WordPress ?p=... and ?replytocom=... junk
2. Rename assets: strip ?ver=... from the file name
3. Update HTML/CSS references to the renamed assets
4. Rewrite absolute site URLs to relative paths
5. Sync to /var/www/correctvloerverwarming2/
"""

import os
import re
import shutil
import subprocess
from pathlib import Path

SRC_DIR = Path("/home/anisy/projects/websites/correctvloerverwarming/correctvloerverwarming.nl")
DST_DIR = Path("/var/www/correctvloerverwarming2")

# File/directory names matching any of these are scrape leftovers.
_JUNK_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r"\?p=\d+",        # WordPress post previews
        r"\?replytocom=",  # Comment reply links
        r"wp-json",        # WordPress JSON API (not needed for a static site)
    )
)

# Extensions that are treated as static assets when stripping version suffixes.
_ASSET_EXTS = frozenset({
    '.js', '.css', '.woff', '.woff2', '.ttf', '.eot', '.svg',
    '.png', '.jpg', '.jpeg', '.webp', '.gif', '.ico',
})

# Capturing group for an asset extension; shared by the HTML and CSS patterns.
_ASSET_EXT_GROUP = (
    r'((?:\.js|\.css|\.woff2?|\.ttf|\.eot|\.svg|\.png|\.jpg|\.jpeg|\.webp|\.gif|\.ico))'
)

# Asset extension followed by a query string, written either as a literal '?'
# or as its percent-encoded form '%3F'. Only the terminating character set
# differs between HTML and CSS.
_HTML_VERSION_RE = re.compile(
    _ASSET_EXT_GROUP + r'(?:%3F|\?)[^"\'&\s>)]+', re.IGNORECASE
)
_CSS_VERSION_RE = re.compile(
    _ASSET_EXT_GROUP + r'(?:%3F|\?)[^"\')\s;]+', re.IGNORECASE
)

# Every scheme/host spelling under which the scraped site refers to itself.
_SITE_ORIGINS = (
    "https://www.correctvloerverwarming.nl",
    "http://www.correctvloerverwarming.nl",
    "https://correctvloerverwarming.nl",
    "http://correctvloerverwarming.nl",
)

# Attributes whose absolute site URLs are rewritten, and the quote styles handled.
_URL_ATTRS = ("href", "src", "action")
_QUOTES = ('"', "'")


def _rewrite_files(paths, transform, label):
    """Apply ``transform(path, text)`` to each file and write back changes.

    Files are read as UTF-8 with undecodable bytes dropped. A file is only
    written when the transformed text differs. I/O errors are reported per
    file (prefixed with ``label``) and do not abort the run.

    Returns the number of files that were rewritten.
    """
    changed = 0
    for path in paths:
        try:
            original = path.read_text(encoding="utf-8", errors="ignore")
            updated = transform(path, original)
            if updated != original:
                path.write_text(updated, encoding="utf-8")
                changed += 1
        except OSError as exc:
            print(f" Fout {label} {path.name}: {exc}")
    return changed


# ─── Step 1: Delete WordPress preview junk ─────────────────────────────────────
def delete_wp_junk(src_dir: Path = SRC_DIR) -> int:
    """Delete files and directories whose name matches a junk pattern.

    Args:
        src_dir: Root of the scraped tree; defaults to ``SRC_DIR``.

    Returns:
        The number of files plus directory trees that were removed.
    """
    deleted = 0
    # Snapshot the listing first: entries are removed while we walk it.
    for entry in list(src_dir.rglob("*")):
        if not any(pattern.search(entry.name) for pattern in _JUNK_PATTERNS):
            continue
        # An entry inside an already-removed junk directory no longer exists;
        # both checks are then False and it is skipped.
        if entry.is_file():
            entry.unlink()
            deleted += 1
        elif entry.is_dir():
            shutil.rmtree(entry)
            deleted += 1
    print(f" Verwijderd: {deleted} WordPress junk bestanden/folders")
    return deleted


# ─── Step 2: Rename assets (strip ?ver=... / version suffixes) ─────────────────
def rename_versioned_assets(src_dir: Path = SRC_DIR) -> int:
    """Strip everything from the first ``?`` in asset file names.

    Only files whose cleaned name has an asset extension are touched, so
    HTML pages saved with a query string are left alone. When the clean
    name already exists, the versioned copy is deleted as a duplicate.

    Args:
        src_dir: Root of the scraped tree; defaults to ``SRC_DIR``.

    Returns:
        The number of files that were renamed (deleted duplicates excluded).
    """
    renamed = 0
    # Snapshot the listing first: entries are renamed while we walk it.
    for entry in list(src_dir.rglob("*")):
        if not entry.is_file() or '?' not in entry.name:
            continue

        # Keep only the clean part in front of the '?'.
        clean_name = entry.name.split('?')[0]
        if Path(clean_name).suffix.lower() not in _ASSET_EXTS:
            continue

        target = entry.parent / clean_name
        if target.exists():
            entry.unlink()  # duplicate: drop the versioned copy
        else:
            entry.rename(target)
            renamed += 1

    print(f" Hernoemd: {renamed} asset bestanden")
    return renamed


# ─── Step 3: Fix references in HTML and CSS ────────────────────────────────────
def fix_references(src_dir: Path = SRC_DIR):
    """Remove query strings after asset extensions in ``*.html`` and ``*.css``.

    Both the literal ``?...`` and the percent-encoded ``%3F...`` forms are
    stripped so references match the file names produced by
    ``rename_versioned_assets``.

    Args:
        src_dir: Root of the scraped tree; defaults to ``SRC_DIR``.

    Returns:
        A ``(html_count, css_count)`` tuple of rewritten file counts.
    """
    html_count = _rewrite_files(
        src_dir.rglob("*.html"),
        lambda _path, text: _HTML_VERSION_RE.sub(r'\1', text),
        "HTML",
    )
    css_count = _rewrite_files(
        src_dir.rglob("*.css"),
        lambda _path, text: _CSS_VERSION_RE.sub(r'\1', text),
        "CSS",
    )
    print(f" HTML bijgewerkt: {html_count} bestanden")
    print(f" CSS bijgewerkt: {css_count} bestanden")
    return html_count, css_count


# ─── Step 4: Rewrite absolute URLs to relative paths ───────────────────────────
def fix_absolute_urls(src_dir: Path = SRC_DIR) -> int:
    """Rewrite absolute same-site URLs in HTML attributes to relative paths.

    For every ``*.html`` file, ``href``/``src``/``action`` values that start
    with one of the site's own origins followed by ``/`` get that prefix
    replaced by a path relative to the file: ``./`` at the root of the tree,
    otherwise one ``../`` per directory level.

    Args:
        src_dir: Root of the scraped tree; defaults to ``SRC_DIR``.

    Returns:
        The number of HTML files that were rewritten.
    """
    def relativize(path, text):
        depth = len(path.relative_to(src_dir).parts) - 1
        prefix = "../" * depth if depth > 0 else "./"
        for origin in _SITE_ORIGINS:
            for attr in _URL_ATTRS:
                for quote in _QUOTES:
                    text = text.replace(
                        f"{attr}={quote}{origin}/", f"{attr}={quote}{prefix}"
                    )
        return text

    count = _rewrite_files(src_dir.rglob("*.html"), relativize, "URL fix")
    print(f" Absolute URLs gefixed: {count} HTML bestanden")
    return count


# ─── Step 5: Sync to webroot ───────────────────────────────────────────────────
def _run_best_effort(cmd):
    """Run ``cmd`` (an argument list, no shell) and warn instead of failing."""
    try:
        result = subprocess.run(cmd, check=False)
    except OSError as exc:
        print(f" Waarschuwing: {' '.join(cmd)} niet uitgevoerd: {exc}")
        return
    if result.returncode != 0:
        print(f" Waarschuwing: {' '.join(cmd)} mislukt (exit {result.returncode})")


def sync_to_webroot(src_dir: Path = SRC_DIR, dst_dir: Path = DST_DIR) -> int:
    """Replace the webroot with a fresh copy of the cleaned scrape.

    Args:
        src_dir: Tree to publish; defaults to ``SRC_DIR``.
        dst_dir: Webroot to replace; defaults to ``DST_DIR``.

    Returns:
        The number of files present in ``dst_dir`` after the copy.

    Raises:
        FileNotFoundError: If ``src_dir`` is not an existing directory. This
            is checked before the current webroot is removed.
    """
    # Never wipe the live webroot when there is nothing to replace it with.
    if not src_dir.is_dir():
        raise FileNotFoundError(f"Bronmap bestaat niet: {src_dir}")

    if dst_dir.exists():
        shutil.rmtree(dst_dir)
    shutil.copytree(src_dir, dst_dir)

    # Set ownership and permissions. Both are best-effort: a failure is
    # reported but does not undo the copy.
    target = os.fspath(dst_dir)
    _run_best_effort(["chown", "-R", "www-data:www-data", target])
    _run_best_effort(["chmod", "-R", "755", target])

    total = sum(1 for path in dst_dir.rglob("*") if path.is_file())
    print(f" {total} bestanden gesynchroniseerd naar {dst_dir}")
    return total


# ─── Main program ──────────────────────────────────────────────────────────────
def main():
    """Run all five cleanup/deploy steps in order."""
    print("\n[1/5] WordPress junk verwijderen...")
    delete_wp_junk()

    print("\n[2/5] Asset bestanden hernoemen...")
    rename_versioned_assets()

    print("\n[3/5] Asset referenties in HTML/CSS fixen...")
    fix_references()

    print("\n[4/5] Absolute URLs → relatieve paden...")
    fix_absolute_urls()

    print("\n[5/5] Sync naar webroot...")
    sync_to_webroot()

    print("\nKlaar! Site bereikbaar op https://correctvloerverwarming2.youztech.nl")


if __name__ == "__main__":
    main()