168 lines
6.4 KiB
Python
168 lines
6.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Volledige cleanup en deploy van de wget-scrape:
|
|
1. Verwijder WordPress ?p=... en ?replytocom=... rommel
|
|
2. Hernoem assets: verwijder ?ver=... uit bestandsnaam
|
|
3. Update HTML/CSS referenties
|
|
4. Sync naar /var/www/correctvloerverwarming2/
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import shutil
|
|
from pathlib import Path
|
|
|
|
# Directory tree that every cleanup step in this script walks and edits in place.
SRC_DIR = Path("/home/anisy/projects/websites/correctvloerverwarming/correctvloerverwarming.nl")
|
|
# Directory that sync_to_webroot() replaces with a copy of SRC_DIR.
DST_DIR = Path("/var/www/correctvloerverwarming2")
|
|
|
|
# ─── Stap 1: Verwijder WordPress preview rommel ────────────────────────────────
|
|
def delete_wp_junk(src_dir=None):
    """Delete WordPress leftovers from the scraped tree.

    An entry is treated as junk when its own name (not its full path)
    contains ``?p=<digits>``, ``?replytocom=`` or ``wp-json``. Matching
    files are unlinked and matching directories are removed recursively.
    A summary line with the number of removed entries is printed.

    Args:
        src_dir: Root of the tree to clean. When omitted, the module-level
            ``SRC_DIR`` is used.
    """
    root = SRC_DIR if src_dir is None else Path(src_dir)

    # One compiled alternation instead of three separate searches per entry.
    junk = re.compile(
        r"\?p=\d+"         # WordPress post previews
        r"|\?replytocom="  # comment-reply permalinks
        r"|wp-json"        # WordPress JSON API (not needed for a static site)
    )

    deleted = 0
    # Take a snapshot first: entries are removed while the tree is walked.
    for entry in list(root.rglob("*")):
        if not junk.search(entry.name):
            continue
        if entry.is_symlink() or entry.is_file():
            # Symlinks are unlinked, never followed: shutil.rmtree refuses
            # to operate on a symlink that points at a directory.
            entry.unlink()
        elif entry.is_dir():
            shutil.rmtree(entry)
        else:
            # Already gone, e.g. removed together with a junk parent folder.
            continue
        deleted += 1

    print(f"  Verwijderd: {deleted} WordPress junk bestanden/folders")
|
|
|
|
# ─── Stap 2: Hernoem assets (verwijder ?ver=... / versie suffixen) ─────────────
|
|
def rename_versioned_assets(src_dir=None):
    """Strip the ``?...`` suffix from the file names of scraped assets.

    Only files whose name contains ``?`` and whose extension (taken from the
    part before the ``?``) is a known asset type are touched; other files
    with a ``?`` in their name are left alone. If the clean name already
    exists in the same directory, the versioned file is a duplicate and is
    deleted instead of renamed. The printed total counts actual renames
    only.

    Args:
        src_dir: Root of the tree to process. When omitted, the module-level
            ``SRC_DIR`` is used.
    """
    root = SRC_DIR if src_dir is None else Path(src_dir)
    asset_exts = frozenset({
        '.js', '.css', '.woff', '.woff2', '.ttf', '.eot', '.svg', '.png',
        '.jpg', '.jpeg', '.webp', '.gif', '.ico',
    })

    renamed = 0
    # Snapshot first: the directory contents change while we iterate.
    for entry in list(root.rglob("*")):
        if not entry.is_file() or '?' not in entry.name:
            continue

        # Keep only the part in front of the first '?'.
        clean_name = entry.name.partition('?')[0]
        if Path(clean_name).suffix.lower() not in asset_exts:
            # Not an asset (for example an HTML page saved as "...?p=12").
            continue

        target = entry.parent / clean_name
        if target.exists():
            # Clean copy already present: drop the versioned duplicate.
            entry.unlink()
        else:
            entry.rename(target)
            renamed += 1

    print(f"  Hernoemd: {renamed} asset bestanden")
|
|
|
|
# ─── Stap 3: Fix referenties in HTML en CSS ────────────────────────────────────
|
|
def _strip_queries(paths, pattern, label):
    """Apply ``pattern`` to each file in ``paths``; return how many files changed."""
    changed = 0
    for path in paths:
        try:
            # Work on the raw bytes with surrogateescape so that bytes which
            # are not valid UTF-8, and the original line endings, survive the
            # rewrite unchanged instead of being silently dropped.
            original = path.read_bytes().decode("utf-8", errors="surrogateescape")
            updated = pattern.sub(r'\1', original)
            if updated != original:
                path.write_bytes(updated.encode("utf-8", errors="surrogateescape"))
                changed += 1
        except (OSError, UnicodeError) as exc:
            # Best effort: report the file and carry on with the next one.
            print(f"  Fout {label} {path.name}: {exc}")
    return changed


def fix_references(src_dir=None):
    """Remove query strings from asset URLs inside HTML and CSS files.

    In ``*.html`` files, everything from ``?`` or ``%3F`` that directly
    follows a known asset extension is removed, up to the next quote, ``&``,
    whitespace, ``>`` or ``)``. In ``*.css`` files, everything from ``?``
    after a known asset extension is removed, up to the next quote, ``)``,
    whitespace or ``;``. Extension matching is case-insensitive. Files are
    only rewritten when their content actually changes; per-file errors are
    printed and do not stop the run.

    Args:
        src_dir: Root of the tree to process. When omitted, the module-level
            ``SRC_DIR`` is used.
    """
    root = SRC_DIR if src_dir is None else Path(src_dir)

    # Shared capture group: the asset extension that must be kept.
    asset_ext = r'((?:\.js|\.css|\.woff2?|\.ttf|\.eot|\.svg|\.png|\.jpg|\.jpeg|\.webp|\.gif|\.ico))'
    html_query = re.compile(asset_ext + r'(?:%3F|\?)[^"\'&\s>)]+', flags=re.IGNORECASE)
    css_query = re.compile(asset_ext + r'\?[^"\')\s;]+', flags=re.IGNORECASE)

    html_count = _strip_queries(root.rglob("*.html"), html_query, "HTML")
    css_count = _strip_queries(root.rglob("*.css"), css_query, "CSS")

    print(f"  HTML bijgewerkt: {html_count} bestanden")
    print(f"  CSS bijgewerkt: {css_count} bestanden")
|
|
|
|
# ─── Stap 4: Fix absolute URLs → relatieve paden ──────────────────────────────
|
|
def _relativize_site_urls(text, prefix):
    """Return ``text`` with absolute links to the site replaced by ``prefix``-relative ones."""
    domains = (
        "https://www.correctvloerverwarming.nl",
        "http://www.correctvloerverwarming.nl",
        "https://correctvloerverwarming.nl",
        "http://correctvloerverwarming.nl",
    )
    for domain in domains:
        for attr in ("href", "src", "action"):
            for quote in ('"', "'"):
                opener = f"{attr}={quote}"
                # "https://site/path" -> "<prefix>path"
                text = text.replace(f"{opener}{domain}/", f"{opener}{prefix}")
                # Bare origin without trailing slash: "https://site" -> "<prefix>"
                text = text.replace(f"{opener}{domain}{quote}", f"{opener}{prefix}{quote}")
    return text


def fix_absolute_urls(src_dir=None):
    """Rewrite absolute links to the site's own domain as relative paths.

    For every ``*.html`` file, ``href``, ``src`` and ``action`` attribute
    values (single- or double-quoted) that start with one of the site's
    http/https origins, with or without ``www.``, are rewritten relative to
    the file's depth below the root: ``./`` for files directly in the root,
    one ``../`` per directory level otherwise. Files are only rewritten when
    something changed; per-file errors are printed and do not stop the run.

    Args:
        src_dir: Root of the tree to process. When omitted, the module-level
            ``SRC_DIR`` is used.
    """
    root = SRC_DIR if src_dir is None else Path(src_dir)

    count = 0
    for html_path in root.rglob("*.html"):
        try:
            # Round-trip raw bytes with surrogateescape so bytes that are not
            # valid UTF-8 and the original line endings are preserved.
            original = html_path.read_bytes().decode("utf-8", errors="surrogateescape")

            # Number of directories between the root and this file.
            depth = len(html_path.relative_to(root).parts) - 1
            prefix = "../" * depth if depth > 0 else "./"

            content = _relativize_site_urls(original, prefix)
            if content != original:
                html_path.write_bytes(content.encode("utf-8", errors="surrogateescape"))
                count += 1
        except (OSError, UnicodeError) as exc:
            # Best effort: report the file and carry on with the next one.
            print(f"  Fout URL fix {html_path.name}: {exc}")

    print(f"  Absolute URLs gefixed: {count} HTML bestanden")
|
|
|
|
# ─── Stap 5: Sync naar webroot ─────────────────────────────────────────────────
|
|
def sync_to_webroot(src_dir=None, dst_dir=None):
    """Replace the web root with a fresh copy of the cleaned-up scrape.

    The tree is first copied to a ``<name>.staging`` directory next to the
    target, ownership and permissions are set there, and only then is the
    old web root removed and the staging directory renamed into place. A
    failed copy therefore leaves the currently served site untouched.
    ``chown`` and ``chmod`` are run without a shell; a failure of either is
    reported as a warning and does not abort the sync.

    Args:
        src_dir: Directory to copy from. When omitted, the module-level
            ``SRC_DIR`` is used.
        dst_dir: Directory to replace. When omitted, the module-level
            ``DST_DIR`` is used.
    """
    import subprocess  # local import keeps this function self-contained

    source = SRC_DIR if src_dir is None else Path(src_dir)
    target = DST_DIR if dst_dir is None else Path(dst_dir)

    staging = target.with_name(target.name + ".staging")
    if staging.exists():
        # Left over from an earlier, interrupted run.
        shutil.rmtree(staging)
    shutil.copytree(source, staging)

    # Set ownership and permissions; argument lists avoid shell interpretation.
    for command in (
        ["chown", "-R", "www-data:www-data", str(staging)],
        ["chmod", "-R", "755", str(staging)],
    ):
        try:
            result = subprocess.run(command, check=False)
        except OSError as exc:
            print(f"  Waarschuwing: {command[0]} kon niet worden uitgevoerd: {exc}")
            continue
        if result.returncode != 0:
            print(f"  Waarschuwing: {command[0]} eindigde met code {result.returncode}")

    # Swap: remove the old web root only after the new copy is complete.
    if target.exists():
        shutil.rmtree(target)
    staging.rename(target)

    total = sum(1 for entry in target.rglob("*") if entry.is_file())
    print(f"  {total} bestanden gesynchroniseerd naar {target}")
|
|
|
|
# ─── Hoofdprogramma ────────────────────────────────────────────────────────────
|
|
if __name__ == "__main__":
    # Pipeline stages in execution order: (banner to print, function to run).
    pipeline = (
        ("\n[1/5] WordPress junk verwijderen...", delete_wp_junk),
        ("\n[2/5] Asset bestanden hernoemen...", rename_versioned_assets),
        ("\n[3/5] Asset referenties in HTML/CSS fixen...", fix_references),
        ("\n[4/5] Absolute URLs → relatieve paden...", fix_absolute_urls),
        ("\n[5/5] Sync naar webroot...", sync_to_webroot),
    )
    for banner, stage in pipeline:
        print(banner)
        stage()

    print("\nKlaar! Site bereikbaar op https://correctvloerverwarming2.youztech.nl")
|