<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" 
                   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
                   xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 
                   http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Generate sitemap.xml with real <lastmod> by probing each URL.
- Tries HTTP HEAD Last-Modified header
- Falls back to GET and parses common meta tags (og:updated_time, article:modified_time, itemprop, etc.)
- Falls back to 'Date' header or today's date if nothing is available
Outputs ISO 8601 date (YYYY-MM-DD).

Usage:
  python generate_sitemap_lastmod.py > sitemap.xml

Requirements:
  pip install requests beautifulsoup4 python-dateutil
"""

import sys
from datetime import datetime, timezone
from xml.sax.saxutils import escape

import requests
from bs4 import BeautifulSoup
from dateutil import parser as dateparser

# ---- CONFIG: your URLs with desired changefreq/priority ----
# Site inventory: each entry is rendered into one <url> element of the
# sitemap. "loc" is required; "changefreq"/"priority" are optional hints
# emitted verbatim when present.
URLS = [
    {"loc": "https://www.alma-voyages.com/", "changefreq": "daily",   "priority": "1.0"},
    {"loc": "https://www.alma-voyages.com/qui-sommes-nous/",          "changefreq": "weekly",  "priority": "0.8"},
    {"loc": "https://www.alma-voyages.com/circuits/",                 "changefreq": "weekly",  "priority": "0.8"},
    {"loc": "https://www.alma-voyages.com/circuits-prives/",          "changefreq": "weekly",  "priority": "0.8"},
    {"loc": "https://www.alma-voyages.com/autotours/",                "changefreq": "weekly",  "priority": "0.8"},
    {"loc": "https://www.alma-voyages.com/croisieres/",               "changefreq": "weekly",  "priority": "0.8"},
    {"loc": "https://www.alma-voyages.com/sejours-clubs/",            "changefreq": "weekly",  "priority": "0.8"},
    {"loc": "https://www.alma-voyages.com/promos-voyages/",           "changefreq": "weekly",  "priority": "0.6"},
    {"loc": "https://www.alma-voyages.com/voyage-promo-circuit-croisiere-aranui/", "changefreq": "weekly",  "priority": "0.6"},
    {"loc": "https://www.alma-voyages.com/voyages-antilles-caraibes/", "changefreq": "monthly","priority": "0.6"},
    {"loc": "https://www.alma-voyages.com/voyages-ile-maurice/",      "changefreq": "monthly", "priority": "0.6"},
    {"loc": "https://www.alma-voyages.com/voyage-cuba/",              "changefreq": "monthly", "priority": "0.6"},
    {"loc": "https://www.alma-voyages.com/promo-voyages-bfm/",        "changefreq": "monthly", "priority": "0.5"},
    {"loc": "https://www.alma-voyages.com/conditions-generales-de-vente/", "changefreq": "monthly", "priority": "0.5"},
    {"loc": "https://www.alma-voyages.com/mentions-legales/",         "changefreq": "monthly", "priority": "0.5"},
    {"loc": "https://www.alma-voyages.com/politique-confidentialite/", "changefreq": "monthly", "priority": "0.5"},
]

# Per-request timeouts passed to requests: (connect, read) seconds.
TIMEOUT = (10, 20)  # (connect, read) seconds
# Identify this crawler in server logs; contact URL included per convention.
HEADERS = {
    "User-Agent": "SitemapLastmodBot/1.0 (+https://www.alma-voyages.com/)"
}

def to_iso_date(d: datetime) -> str:
    """Render an aware datetime as a UTC calendar date (YYYY-MM-DD)."""
    utc_day = d.astimezone(timezone.utc).date()
    return utc_day.isoformat()

def parse_date_safe(value: str) -> datetime | None:
    """Parse *value* into a timezone-aware datetime.

    Returns None when the string cannot be parsed. Naive results are
    assumed to be UTC.
    """
    try:
        parsed = dateparser.parse(value)
    except Exception:
        return None
    if parsed is None:
        return None
    if parsed.tzinfo is None:
        # dateutil produced a naive datetime — treat it as UTC
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed

def extract_lastmod_from_html(html: str) -> datetime | None:
    """Scan *html* for modification-date hints and return the newest one.

    Checks Open Graph / article meta properties, name-based meta tags
    (lastmod, dc.date, ...), and <time> elements. Returns None when no
    candidate parses into a datetime.
    """
    soup = BeautifulSoup(html, "html.parser")
    raw_values: list[str] = []

    # Open Graph / Article meta properties
    og_props = ("article:modified_time", "article:published_time",
                "og:updated_time", "og:published_time")
    for prop in og_props:
        meta = soup.find("meta", attrs={"property": prop})
        if meta and meta.get("content"):
            raw_values.append(meta["content"])

    # name-based meta tags (Dublin Core and common ad-hoc names)
    meta_names = ("lastmod", "lastmodified", "revised", "modified",
                  "date", "dc.date.modified", "dc.date")
    for name in meta_names:
        meta = soup.find("meta", attrs={"name": name})
        if meta and meta.get("content"):
            raw_values.append(meta["content"])

    # <time datetime="..."> (or content=) elements
    for time_tag in soup.find_all("time"):
        stamp = time_tag.get("datetime") or time_tag.get("content")
        if stamp:
            raw_values.append(stamp)

    # Keep only the values that parse; the most recent wins
    dates = [d for d in (parse_date_safe(v) for v in raw_values) if d is not None]
    return max(dates) if dates else None

def fetch_lastmod(url: str) -> datetime:
    """Best-effort probe of *url*'s last-modification time (always aware).

    Order of preference: HEAD Last-Modified, GET Last-Modified, meta tags
    in the GET body, GET Date header, HEAD Date header, current time.
    """
    # HEAD 'Date' header, remembered only as a weak last-ditch proxy
    weak_dt: datetime | None = None

    # 1) HEAD request: cheapest way to get a Last-Modified header
    try:
        head = requests.head(url, headers=HEADERS, allow_redirects=True, timeout=TIMEOUT)
        lm = head.headers.get("Last-Modified")
        if lm and (dt := parse_date_safe(lm)):
            return dt
        date_hdr = head.headers.get("Date")
        weak_dt = parse_date_safe(date_hdr) if date_hdr else None
    except Exception:
        weak_dt = None

    # 2) GET request: check headers again, then mine the HTML for meta dates
    try:
        resp = requests.get(url, headers=HEADERS, allow_redirects=True, timeout=TIMEOUT)
        resp.raise_for_status()

        lm = resp.headers.get("Last-Modified")
        if lm and (dt := parse_date_safe(lm)):
            return dt

        if meta_dt := extract_lastmod_from_html(resp.text):
            return meta_dt

        date_hdr = resp.headers.get("Date")
        if date_hdr and (dt := parse_date_safe(date_hdr)):
            return dt
    except Exception:
        pass

    # 3) Ultimate fallback: HEAD Date proxy, else "now" in UTC
    return weak_dt or datetime.now(timezone.utc)

def make_sitemap(urls_with_meta: list[dict]) -> str:
    """Build the sitemap XML document for *urls_with_meta*.

    Each entry must carry a "loc" key and may carry optional
    "changefreq"/"priority" hints. <lastmod> is probed live for every
    URL via fetch_lastmod() and formatted as YYYY-MM-DD.

    Fix: values are run through xml.sax.saxutils.escape() before being
    interpolated — a raw '&' in a URL (e.g. a query string) previously
    produced an invalid XML document.
    """
    lines = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"',
        '        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"',
        '        xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">',
        "",
    ]
    for u in urls_with_meta:
        lastmod = to_iso_date(fetch_lastmod(u["loc"]))
        lines.append("  <url>")
        lines.append(f"    <loc>{escape(u['loc'])}</loc>")
        lines.append(f"    <lastmod>{lastmod}</lastmod>")
        if u.get("changefreq"):
            lines.append(f"    <changefreq>{escape(u['changefreq'])}</changefreq>")
        if u.get("priority"):
            lines.append(f"    <priority>{escape(u['priority'])}</priority>")
        lines.append("  </url>\n")
    lines.append("</urlset>\n")
    return "\n".join(lines)

def main():
    """Generate the sitemap, echo it to stdout, and persist sitemap.xml."""
    xml = make_sitemap(URLS)
    sys.stdout.write(xml)
    # Also keep a copy on disk next to the script
    with open("sitemap.xml", "w", encoding="utf-8") as out:
        out.write(xml)

if __name__ == "__main__":
    main()

</urlset>