This script reads a local sitemap.xml, extracts all the URLs, fetches the HTML content of each URL, and stores the data in a SQLite database for offline analysis and SEO audits.
settings.py
This file serves as the centralized configuration module used by all Python-based auditor scripts. It contains paths to the database, root directories, user-agent string, and timeout settings, making it easy to manage and reuse these values across multiple scripts.
# settings.py
# Common configuration for all auditor scripts
DB_PATH = r"E:\testing3\site_audit\auditor.db" # includes the database name
ROOT_PATH = r"E:\testing3\site_audit\\" # path to the directory
# Optional: You can also store other shared constants here
UA = "Mozilla/5.0 (compatible; Plus2net-BS4-Auditor/1.0)"
TIMEOUT = 30
# fetch_store_from_xml.py
# Fetch HTML from URLs listed in sitemap.xml and store them in SQLite.
import sqlite3
from datetime import datetime
import requests
import xml.etree.ElementTree as ET
from settings import DB_PATH, UA, TIMEOUT
# Path to sitemap.xml file
path1 = "C:\\xampp\\htdocs\\plus2net\\php_tutorial\\sitemap.xml"
# Extract URLs from sitemap
def extract_urls_from_sitemap(xml_path):
    urls = []
    try:
        with open(xml_path, 'r', encoding='utf-8') as f:
            tree = ET.parse(f)
            root = tree.getroot()
            # Standard sitemap protocol namespace uses http:// (it must match the URI declared in the sitemap file)
            ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
            urls = [loc.text for loc in root.findall('.//ns:loc', ns)]
    except Exception as e:
        print("❌ Error reading sitemap:", e)
    return urls
# Initialize SQLite database
def init_db():
    con = sqlite3.connect(DB_PATH)
    cur = con.cursor()
    cur.execute("""
        CREATE TABLE IF NOT EXISTS pages (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT UNIQUE NOT NULL,
            fetched_at TEXT NOT NULL,
            status_code INTEGER,
            html TEXT
        )
    """)
    con.commit()
    con.close()
# Fetch HTML from a URL
def fetch_html(url: str) -> tuple[int, str]:
    with requests.Session() as s:
        s.headers.update({"User-Agent": UA})
        r = s.get(url, timeout=TIMEOUT)
        return r.status_code, r.text
# Insert or update HTML in database
def upsert_page(url: str, status: int, html: str) -> int:
    con = sqlite3.connect(DB_PATH)
    cur = con.cursor()
    now = datetime.utcnow().isoformat(timespec="seconds") + "Z"
    cur.execute("SELECT id FROM pages WHERE url = ?", (url,))
    row = cur.fetchone()
    if row:
        page_id = row[0]
        cur.execute(
            "UPDATE pages SET fetched_at=?, status_code=?, html=? WHERE id=?",
            (now, status, html, page_id)
        )
    else:
        cur.execute(
            "INSERT INTO pages (url, fetched_at, status_code, html) VALUES (?,?,?,?)",
            (url, now, status, html)
        )
        page_id = cur.lastrowid
    con.commit()
    con.close()
    return page_id
# Run the full process
if __name__ == "__main__":
    init_db()
    URLS = extract_urls_from_sitemap(path1)
    print(f"🔎 Total URLs found: {len(URLS)}")
    for url in URLS:
        try:
            status, html = fetch_html(url)
            page_id = upsert_page(url, status, html)
            print(f"[OK] Stored {url} (id={page_id}, status={status})")
        except Exception as e:
            print(f"[ERR] {url}: {e}")
# Quick check: run ad-hoc queries against the stored data
import sqlite3
from settings import DB_PATH, ROOT_PATH  # shared paths from the common settings file
con = sqlite3.connect(DB_PATH)
#query = 'SELECT url FROM page_summary WHERE canonical_url != url'  # pages whose canonical URL differs from the page URL
#query = 'DELETE FROM page_summary'
#query = 'DELETE FROM pages'
#query = 'SELECT url FROM pages'
query = 'SELECT url, LENGTH(html) FROM pages ORDER BY LENGTH(html) LIMIT 5,10'  # ordered by HTML size; skip 5 rows, return the next 10
#query = 'SELECT COUNT(*) AS no FROM pages'
r_set = con.execute(query)
for row in r_set:
    print(row)
con.commit()
con.close()
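As another quick check, here is a minimal sketch (using only the pages table created above) that counts stored pages per HTTP status code, which helps spot broken or redirected URLs during an audit:

# Hypothetical helper: summarize fetch results by status code
import sqlite3
from settings import DB_PATH

con = sqlite3.connect(DB_PATH)
query = """
    SELECT status_code, COUNT(*) AS pages
    FROM pages
    GROUP BY status_code
    ORDER BY pages DESC
"""
for status_code, count in con.execute(query):
    print(f"status {status_code}: {count} page(s)")
con.close()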
A sitemap.xml is an XML file listing all (or most) URLs on your website. It helps search engines discover and crawl your pages efficiently, including new or updated ones.
You can use Python's built-in XML libraries such as xml.etree.ElementTree or external ones. Parse the XML, handle namespaces properly using a mapping like ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}, then find all <loc> tags and get their text values.
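As a small self-contained illustration (the sample XML below is made up for demonstration), the same approach can be tested against an inline string:

# Minimal example: parse a sitemap fragment from a string
import xml.etree.ElementTree as ET

sample = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/</loc></url>
  <url><loc>https://example.com/about.html</loc></url>
</urlset>"""

root = ET.fromstring(sample)
ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
urls = [loc.text for loc in root.findall('.//ns:loc', ns)]
print(urls)  # ['https://example.com/', 'https://example.com/about.html']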
Namespaces are common in sitemap files. Without using the correct namespace in your findall XPath or search, Python may not locate elements like <loc>. Include the namespace in the search, for example root.findall('.//ns:loc', ns).
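If you would rather not hardcode the namespace URI, ElementTree (Python 3.8 and later) also accepts a {*} wildcard in the tag name; this short sketch assumes the root variable from the example above:

# Alternative: match <loc> in any namespace (Python 3.8+ ElementTree wildcard)
urls = [loc.text for loc in root.findall('.//{*}loc')]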
You should re-run the script whenever you add, remove, or update pages, typically weekly or monthly. If your site changes frequently, automating the run with cron or a task scheduler is helpful.
Author
🎥 Join me live on YouTube
Passionate about coding and teaching, I publish practical tutorials on PHP, Python, JavaScript, SQL, and web development. My goal is to make learning simple, engaging, and project-oriented with real examples and source code.