<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" 
                   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
                   xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 
                   http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Generate sitemap.xml with real <lastmod> by probing each URL.
- Tries HTTP HEAD Last-Modified header
- Falls back to GET and parses common meta tags (og:updated_time, article:modified_time, itemprop, etc.)
- Falls back to 'Date' header or today's date if nothing is available
Outputs ISO 8601 date (YYYY-MM-DD).

Usage:
  python generate_sitemap_lastmod.py > sitemap.xml

Requirements:
  pip install requests beautifulsoup4 python-dateutil
"""

import sys
from datetime import datetime, timezone
from xml.sax.saxutils import escape

import requests
from bs4 import BeautifulSoup
from dateutil import parser as dateparser

# ---- CONFIG: your URLs with desired changefreq/priority ----
# Site inventory: each entry is rendered into one <url> element of the
# sitemap. "loc" is required; "changefreq"/"priority" are optional hints
# emitted verbatim when present.
URLS = [
    {"loc": "https://www.alma-voyages.com/", "changefreq": "daily",   "priority": "1.0"},
    {"loc": "https://www.alma-voyages.com/qui-sommes-nous/",          "changefreq": "weekly",  "priority": "0.8"},
    {"loc": "https://www.alma-voyages.com/circuits/",                 "changefreq": "weekly",  "priority": "0.8"},
    {"loc": "https://www.alma-voyages.com/circuits-prives/",          "changefreq": "weekly",  "priority": "0.8"},
    {"loc": "https://www.alma-voyages.com/autotours/",                "changefreq": "weekly",  "priority": "0.8"},
    {"loc": "https://www.alma-voyages.com/croisieres/",               "changefreq": "weekly",  "priority": "0.8"},
    {"loc": "https://www.alma-voyages.com/sejours-clubs/",            "changefreq": "weekly",  "priority": "0.8"},
    {"loc": "https://www.alma-voyages.com/promos-voyages/",           "changefreq": "weekly",  "priority": "0.6"},
    {"loc": "https://www.alma-voyages.com/voyage-promo-circuit-croisiere-aranui/", "changefreq": "weekly",  "priority": "0.6"},
    {"loc": "https://www.alma-voyages.com/voyages-antilles-caraibes/", "changefreq": "monthly","priority": "0.6"},
    {"loc": "https://www.alma-voyages.com/voyages-ile-maurice/",      "changefreq": "monthly", "priority": "0.6"},
    {"loc": "https://www.alma-voyages.com/voyage-cuba/",              "changefreq": "monthly", "priority": "0.6"},
    {"loc": "https://www.alma-voyages.com/promo-voyages-bfm/",        "changefreq": "monthly", "priority": "0.5"},
    {"loc": "https://www.alma-voyages.com/conditions-generales-de-vente/", "changefreq": "monthly", "priority": "0.5"},
    {"loc": "https://www.alma-voyages.com/mentions-legales/",         "changefreq": "monthly", "priority": "0.5"},
    {"loc": "https://www.alma-voyages.com/politique-confidentialite/", "changefreq": "monthly", "priority": "0.5"},
]

# Per-request timeouts passed to requests: (connect, read) seconds.
TIMEOUT = (10, 20)  # (connect, read) seconds
# Identify this crawler in server logs; contact URL included per convention.
HEADERS = {
    "User-Agent": "SitemapLastmodBot/1.0 (+https://www.alma-voyages.com/)"
}

def to_iso_date(d: datetime) -> str:
    """Render an aware datetime as a UTC calendar date (YYYY-MM-DD)."""
    utc_day = d.astimezone(timezone.utc).date()
    return utc_day.isoformat()

def parse_date_safe(value: str) -> datetime | None:
    """Parse *value* into a timezone-aware datetime.

    Returns None when the string cannot be parsed. Naive results are
    assumed to be UTC.
    """
    try:
        parsed = dateparser.parse(value)
    except Exception:
        return None
    if parsed is None:
        return None
    if parsed.tzinfo is None:
        # dateutil produced a naive datetime — treat it as UTC
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed

def extract_lastmod_from_html(html: str) -> datetime | None:
    """Scan *html* for modification-date hints and return the newest one.

    Checks Open Graph / article meta properties, name-based meta tags
    (lastmod, dc.date, ...), and <time> elements. Returns None when no
    candidate parses into a datetime.
    """
    soup = BeautifulSoup(html, "html.parser")
    raw_values: list[str] = []

    # Open Graph / Article meta properties
    og_props = ("article:modified_time", "article:published_time",
                "og:updated_time", "og:published_time")
    for prop in og_props:
        meta = soup.find("meta", attrs={"property": prop})
        if meta and meta.get("content"):
            raw_values.append(meta["content"])

    # name-based meta tags (Dublin Core and common ad-hoc names)
    meta_names = ("lastmod", "lastmodified", "revised", "modified",
                  "date", "dc.date.modified", "dc.date")
    for name in meta_names:
        meta = soup.find("meta", attrs={"name": name})
        if meta and meta.get("content"):
            raw_values.append(meta["content"])

    # <time datetime="..."> (or content=) elements
    for time_tag in soup.find_all("time"):
        stamp = time_tag.get("datetime") or time_tag.get("content")
        if stamp:
            raw_values.append(stamp)

    # Keep only the values that parse; the most recent wins
    dates = [d for d in (parse_date_safe(v) for v in raw_values) if d is not None]
    return max(dates) if dates else None

def fetch_lastmod(url: str) -> datetime:
    """Best-effort probe of *url*'s last-modification time (always aware).

    Order of preference: HEAD Last-Modified, GET Last-Modified, meta tags
    in the GET body, GET Date header, HEAD Date header, current time.
    """
    # HEAD 'Date' header, remembered only as a weak last-ditch proxy
    weak_dt: datetime | None = None

    # 1) HEAD request: cheapest way to get a Last-Modified header
    try:
        head = requests.head(url, headers=HEADERS, allow_redirects=True, timeout=TIMEOUT)
        lm = head.headers.get("Last-Modified")
        if lm and (dt := parse_date_safe(lm)):
            return dt
        date_hdr = head.headers.get("Date")
        weak_dt = parse_date_safe(date_hdr) if date_hdr else None
    except Exception:
        weak_dt = None

    # 2) GET request: check headers again, then mine the HTML for meta dates
    try:
        resp = requests.get(url, headers=HEADERS, allow_redirects=True, timeout=TIMEOUT)
        resp.raise_for_status()

        lm = resp.headers.get("Last-Modified")
        if lm and (dt := parse_date_safe(lm)):
            return dt

        if meta_dt := extract_lastmod_from_html(resp.text):
            return meta_dt

        date_hdr = resp.headers.get("Date")
        if date_hdr and (dt := parse_date_safe(date_hdr)):
            return dt
    except Exception:
        pass

    # 3) Ultimate fallback: HEAD Date proxy, else "now" in UTC
    return weak_dt or datetime.now(timezone.utc)

def make_sitemap(urls_with_meta: list[dict]) -> str:
    """Build the sitemap XML document for *urls_with_meta*.

    Each entry must carry a "loc" key and may carry optional
    "changefreq"/"priority" hints. <lastmod> is probed live for every
    URL via fetch_lastmod() and formatted as YYYY-MM-DD.

    Fix: values are run through xml.sax.saxutils.escape() before being
    interpolated — a raw '&' in a URL (e.g. a query string) previously
    produced an invalid XML document.
    """
    lines = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"',
        '        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"',
        '        xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">',
        "",
    ]
    for u in urls_with_meta:
        lastmod = to_iso_date(fetch_lastmod(u["loc"]))
        lines.append("  <url>")
        lines.append(f"    <loc>{escape(u['loc'])}</loc>")
        lines.append(f"    <lastmod>{lastmod}</lastmod>")
        if u.get("changefreq"):
            lines.append(f"    <changefreq>{escape(u['changefreq'])}</changefreq>")
        if u.get("priority"):
            lines.append(f"    <priority>{escape(u['priority'])}</priority>")
        lines.append("  </url>\n")
    lines.append("</urlset>\n")
    return "\n".join(lines)

def main():
    """Generate the sitemap, echo it to stdout, and persist sitemap.xml."""
    xml = make_sitemap(URLS)
    sys.stdout.write(xml)
    # Also keep a copy on disk next to the script
    with open("sitemap.xml", "w", encoding="utf-8") as out:
        out.write(xml)

if __name__ == "__main__":
    main()

</urlset>