This script reads a local sitemap.xml, extracts all the URLs, fetches the HTML content of each URL, and stores the data in a SQLite database for offline analysis and SEO audits.
settings.py
This file serves as the centralized configuration module used by all Python-based auditor scripts. It contains paths to the database, root directories, user-agent string, and timeout settings, making it easy to manage and reuse these values across multiple scripts.
# settings.py
# Common configuration for all auditor scripts
DB_PATH = r"E:\testing3\site_audit\auditor.db" # includes the database name
ROOT_PATH = r"E:\testing3\site_audit\\" # path to the directory
# Optional: You can also store other shared constants here
UA = "Mozilla/5.0 (compatible; Plus2net-BS4-Auditor/1.0)"
TIMEOUT = 30
# fetch_store_from_xml.py
# Fetch HTML from URLs listed in sitemap.xml and store them in SQLite.
import sqlite3
from datetime import datetime
import requests
import xml.etree.ElementTree as ET
from settings import DB_PATH, UA, TIMEOUT
# Path to sitemap.xml file
path1 = "C:\\xampp\\htdocs\\plus2net\\php_tutorial\\sitemap.xml"
# Extract URLs from sitemap
def extract_urls_from_sitemap(xml_path):
    urls = []
    try:
        with open(xml_path, 'r', encoding='utf-8') as f:
            tree = ET.parse(f)
            root = tree.getroot()
            # Standard sitemap protocol namespace uses http:// (it must match the URI declared in the sitemap file)
            ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
            urls = [loc.text for loc in root.findall('.//ns:loc', ns)]
    except Exception as e:
        print("❌ Error reading sitemap:", e)
    return urls
# Initialize SQLite database
def init_db():
    con = sqlite3.connect(DB_PATH)
    cur = con.cursor()
    cur.execute("""
        CREATE TABLE IF NOT EXISTS pages (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT UNIQUE NOT NULL,
            fetched_at TEXT NOT NULL,
            status_code INTEGER,
            html TEXT
        )
    """)
    con.commit()
    con.close()
# Fetch HTML from a URL
def fetch_html(url: str) -> tuple[int, str]:
    with requests.Session() as s:
        s.headers.update({"User-Agent": UA})
        r = s.get(url, timeout=TIMEOUT)
        return r.status_code, r.text
# Insert or update HTML in database
def upsert_page(url: str, status: int, html: str) -> int:
    con = sqlite3.connect(DB_PATH)
    cur = con.cursor()
    now = datetime.utcnow().isoformat(timespec="seconds") + "Z"
    cur.execute("SELECT id FROM pages WHERE url = ?", (url,))
    row = cur.fetchone()
    if row:
        page_id = row[0]
        cur.execute(
            "UPDATE pages SET fetched_at=?, status_code=?, html=? WHERE id=?",
            (now, status, html, page_id)
        )
    else:
        cur.execute(
            "INSERT INTO pages (url, fetched_at, status_code, html) VALUES (?,?,?,?)",
            (url, now, status, html)
        )
        page_id = cur.lastrowid
    con.commit()
    con.close()
    return page_id
# Run the full process
if __name__ == "__main__":
    init_db()
    URLS = extract_urls_from_sitemap(path1)
    print(f"🔎 Total URLs found: {len(URLS)}")
    for url in URLS:
        try:
            status, html = fetch_html(url)
            page_id = upsert_page(url, status, html)
            print(f"[OK] Stored {url} (id={page_id}, status={status})")
        except Exception as e:
            print(f"[ERR] {url}: {e}")
# Quick check: run ad-hoc queries against the stored data
import sqlite3
from settings import DB_PATH, ROOT_PATH  # shared paths from the common settings file
con = sqlite3.connect(DB_PATH)
#query = 'SELECT url FROM page_summary WHERE canonical_url != url'  # pages whose canonical URL differs from the page URL
#query = 'DELETE FROM page_summary'
#query = 'DELETE FROM pages'
#query = 'SELECT url FROM pages'
query = 'SELECT url, LENGTH(html) FROM pages ORDER BY LENGTH(html) LIMIT 5,10'  # ordered by HTML size; skip 5 rows, return the next 10
#query = 'SELECT COUNT(*) AS no FROM pages'
r_set = con.execute(query)
for row in r_set:
    print(row)
con.commit()
con.close()
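As another quick check, here is a minimal sketch (using only the pages table created above) that counts stored pages per HTTP status code, which helps spot broken or redirected URLs during an audit:

# Hypothetical helper: summarize fetch results by status code
import sqlite3
from settings import DB_PATH

con = sqlite3.connect(DB_PATH)
query = """
    SELECT status_code, COUNT(*) AS pages
    FROM pages
    GROUP BY status_code
    ORDER BY pages DESC
"""
for status_code, count in con.execute(query):
    print(f"status {status_code}: {count} page(s)")
con.close()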
A sitemap.xml is an XML file listing all (or most) URLs on your website. It helps search engines discover and crawl your pages efficiently, including new or updated ones.
You can use Python's built-in XML libraries such as xml.etree.ElementTree or external ones. Parse the XML, handle namespaces properly using a mapping like ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}, then find all <loc> tags and get their text values.
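As a small self-contained illustration (the sample XML below is made up for demonstration), the same approach can be tested against an inline string:

# Minimal example: parse a sitemap fragment from a string
import xml.etree.ElementTree as ET

sample = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/</loc></url>
  <url><loc>https://example.com/about.html</loc></url>
</urlset>"""

root = ET.fromstring(sample)
ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
urls = [loc.text for loc in root.findall('.//ns:loc', ns)]
print(urls)  # ['https://example.com/', 'https://example.com/about.html']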
Namespaces are common in sitemap files. Without using the correct namespace in your findall XPath or search, Python may not locate elements like <loc>. Include the namespace in the search, for example root.findall('.//ns:loc', ns).
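If you would rather not hardcode the namespace URI, ElementTree (Python 3.8 and later) also accepts a {*} wildcard in the tag name; this short sketch assumes the root variable from the example above:

# Alternative: match <loc> in any namespace (Python 3.8+ ElementTree wildcard)
urls = [loc.text for loc in root.findall('.//{*}loc')]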
You should re-run the script whenever you add, remove, or update pages, typically weekly or monthly. If your site changes frequently, automating the run with cron or a task scheduler is helpful.
Author
🎥 Join me live on YouTube
Passionate about coding and teaching, I publish practical tutorials on PHP, Python, JavaScript, SQL, and web development. My goal is to make learning simple, engaging, and project-oriented with real examples and source code.