Want to quickly check whether a webpage has all the essential SEO and AI-friendly elements without paid tools or API keys? In this guide, we’ll build and run a Python-based webpage auditor using BeautifulSoup and requests. The script runs locally, requires no authentication, and returns practical, actionable results.
Many SEO audit tools are paid or have API limits. This simple_auditor.py script:
- Extracts key metadata (title, description, keywords) case-insensitively.
- Reports alt attribute coverage for images.
- Checks whether a sitemap is declared in robots.txt.

Make sure you have the following installed:
pip install requests beautifulsoup4 lxml
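Before moving on, you can optionally confirm the libraries import cleanly. This throwaway snippet (not part of the auditor) parses a trivial document with the lxml parser:

# deps_check.py -- optional: verify requests, beautifulsoup4 and lxml are installed
import requests
from bs4 import BeautifulSoup

print("requests", requests.__version__)
print(BeautifulSoup("<p>ok</p>", "lxml").p.get_text())  # prints "ok" if lxml is available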
simple_auditor.py

Here’s the full working script. Change url_to_audit to the webpage you want to check.
# simple_auditor.py
# Minimal, API-key-free webpage auditor with case-insensitive meta extraction.
# Change `url_to_audit` to target another page.
import re
import requests
from urllib.parse import urlparse
from bs4 import BeautifulSoup
# ====== Change this URL to audit another page ======
url_to_audit = "https://www.plus2net.com/python/pandas.php"
UA = "Mozilla/5.0 (compatible; AI-Ready-Auditor/1.1; +https://example.com)"
TIMEOUT = 30

def fetch_html(url: str) -> tuple[int, str]:
    r = requests.get(url, headers={"User-Agent": UA}, timeout=TIMEOUT)
    return r.status_code, r.text

def get_meta_by_name(soup: BeautifulSoup, meta_name: str):
    """
    Case-insensitive fetch of content.
    Matches NAME="DESCRIPTION" as well as name="description".
    """
    tag = soup.find("meta", attrs={"name": re.compile(rf"^{re.escape(meta_name)}$", re.I)})
    return (tag.get("content") or "").strip() if tag and tag.get("content") else None

def get_meta_by_property(soup: BeautifulSoup, prop_name: str):
    """
    Case-insensitive fetch of content (e.g., og:description).
    """
    tag = soup.find("meta", attrs={"property": re.compile(rf"^{re.escape(prop_name)}$", re.I)})
    return (tag.get("content") or "").strip() if tag and tag.get("content") else None

def extract_schema_flags(json_ld_blocks: list[str]):
    """
    Simple presence checks for common schema types used in articles.
    """
    j = "\n".join(json_ld_blocks)
    has_faq = "FAQPage" in j
    has_howto = "HowTo" in j
    has_video = "VideoObject" in j
    return has_faq, has_howto, has_video

def analyze(url: str):
    status, html = fetch_html(url)
    soup = BeautifulSoup(html, "lxml")

    # ---- Metadata (case-insensitive for meta name/property values) ----
    title = soup.title.string.strip() if soup.title and soup.title.string else None

    # Description: name=description (any case) with fallback to og:description
    meta_description = get_meta_by_name(soup, "description") or get_meta_by_property(soup, "og:description")
    meta_description_length = len(meta_description) if meta_description else None

    # Keywords (optional, informational)
    meta_keywords = get_meta_by_name(soup, "keywords")

    # Viewport (sanity check for mobile friendliness)
    meta_viewport = get_meta_by_name(soup, "viewport")

    # ---- Headings ----
    h1_tag = soup.find("h1")
    h1 = h1_tag.get_text(strip=True) if h1_tag else None

    # ---- Canonical (case-insensitive rel) ----
    canonical_tag = soup.find("link", rel=lambda v: v and v.lower() == "canonical")
    canonical_url = canonical_tag.get("href") if canonical_tag else None

    # ---- GA4 (G-XXXXXXXX) ----
    ga4_ids = sorted(set(re.findall(r"G-[A-Z0-9]{6,12}", html)))
    has_ga4 = bool(ga4_ids)

    # ---- Open Graph / Twitter (case-insensitive) ----
    og_tags = {
        t.get("property"): t.get("content")
        for t in soup.find_all("meta", attrs={"property": re.compile(r"^og:", re.I)})
        if t.get("property") and t.get("content")
    }
    twitter_tags = {
        t.get("name"): t.get("content")
        for t in soup.find_all("meta", attrs={"name": re.compile(r"^twitter:", re.I)})
        if t.get("name") and t.get("content")
    }

    # ---- Images / alt coverage ----
    imgs = soup.find_all("img")
    images_total = len(imgs)
    images_with_alt = sum(1 for i in imgs if (i.get("alt") or "").strip())
    images_alt_coverage_pct = round((images_with_alt / images_total * 100), 2) if images_total else 0.0

    # ---- Links (internal vs external) ----
    parsed = urlparse(url)
    base_host = parsed.netloc
    internal_links, external_links = 0, 0
    for a in soup.find_all("a", href=True):
        href = a.get("href")
        p = urlparse(href)
        if not p.netloc or p.netloc == base_host:
            internal_links += 1
        else:
            external_links += 1

    # ---- Schema JSON-LD presence ----
    json_ld_blocks = [t.get_text() for t in soup.find_all("script", type="application/ld+json")]
    has_faq_schema, has_howto_schema, has_video_schema = extract_schema_flags(json_ld_blocks)

    # ---- Breadcrumbs ----
    breadcrumbs_present = bool(soup.select('[itemtype*="BreadcrumbList" i], nav.breadcrumb, ol.breadcrumb'))

    # ---- robots.txt & sitemap hint ----
    root = f"{parsed.scheme}://{parsed.netloc}"
    sitemap_present = False
    try:
        rr = requests.get(f"{root}/robots.txt", headers={"User-Agent": UA}, timeout=10)
        if rr.ok and "sitemap" in rr.text.lower():
            sitemap_present = True
    except Exception:
        pass

    return {
        "url": url,
        "http_status": status,
        "title": title,
        "title_length": len(title) if title else None,
        "meta_description": meta_description,
        "meta_description_length": meta_description_length,
        "meta_keywords": meta_keywords,
        "meta_viewport_present": bool(meta_viewport),
        "h1": h1,
        "canonical_url": canonical_url,
        "ga4_ids": ga4_ids,
        "og_tags_present": bool(og_tags),
        "twitter_tags_present": bool(twitter_tags),
        "images_total": images_total,
        "images_with_alt": images_with_alt,
        "images_alt_coverage_pct": images_alt_coverage_pct,
        "internal_links": internal_links,
        "external_links": external_links,
        "has_faq_schema": has_faq_schema,
        "has_howto_schema": has_howto_schema,
        "has_videoobject_schema": has_video_schema,
        "breadcrumbs_present": breadcrumbs_present,
        "sitemap_present": sitemap_present
    }

def print_summary(r: dict):
    print("\n=== AI-Ready Webpage Audit Summary ===")
    print("URL:", r["url"])
    print("HTTP Status:", r["http_status"])

    # Title / Meta
    print(f"Title ({r['title_length']} chars):", "Present" if r["title"] else "Missing")
    print(
        f"Meta Description ({r['meta_description_length']} chars):",
        "Present" if r["meta_description"] else "Missing"
    )
    print("Meta Keywords:", "Present" if r.get("meta_keywords") else "Missing")
    print("Viewport Meta:", "Present" if r.get("meta_viewport_present") else "Missing")

    # Structure
    print("H1:", "Present" if r["h1"] else "Missing")
    print("Canonical URL:", r["canonical_url"] or "Missing")

    # Tracking / Social
    print("GA4 IDs:", r["ga4_ids"] if r["ga4_ids"] else "None Found")
    print("OpenGraph Tags:", r["og_tags_present"])
    print("Twitter Tags:", r["twitter_tags_present"])

    # Media / Links
    print(f"Images with alt: {r['images_with_alt']}/{r['images_total']} ({r['images_alt_coverage_pct']}%)")
    print("Internal Links:", r["internal_links"], "| External Links:", r["external_links"])

    # Schema / Nav
    print("FAQ Schema:", r["has_faq_schema"])
    print("HowTo Schema:", r["has_howto_schema"])
    print("VideoObject Schema:", r["has_videoobject_schema"])
    print("Breadcrumbs:", r["breadcrumbs_present"])
    print("Sitemap in robots.txt:", r["sitemap_present"])
    print("======================================\n")

if __name__ == "__main__":
    report = analyze(url_to_audit)
    print_summary(report)
The script downloads the page with the requests library, then uses compiled regular expressions with BeautifulSoup to find meta tags case-insensitively, so NAME="DESCRIPTION" and name="description" are treated the same.
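As a quick illustration (a separate throwaway snippet, not part of the script), the same helper matches an upper-case meta name; importing it is safe because the network call only runs under the __main__ guard:

# demo_meta_lookup.py -- illustrative only; assumes it sits next to simple_auditor.py
from bs4 import BeautifulSoup
from simple_auditor import get_meta_by_name

html = '<html><head><meta name="DESCRIPTION" content="A sample description"></head></html>'
soup = BeautifulSoup(html, "lxml")
print(get_meta_by_name(soup, "description"))  # -> A sample description

Running the full script against the sample URL prints a summary like the one below.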
=== AI-Ready Webpage Audit Summary ===
URL: https://www.plus2net.com/python/pandas.php
HTTP Status: 200
Title (38 chars): Present
Meta Description (45 chars): Present
Meta Keywords: Present
Viewport Meta: Present
H1: Present
Canonical URL: https://www.plus2net.com/python/pandas.php
GA4 IDs: ['G-DXKVCW4XVG']
OpenGraph Tags: True
Twitter Tags: True
Images with alt: 5/5 (100.0%)
Internal Links: 12 | External Links: 3
FAQ Schema: False
HowTo Schema: False
VideoObject Schema: False
Breadcrumbs: True
Sitemap in robots.txt: True
======================================
You can adapt the script in a few ways:

- Change url_to_audit at the top of the script to audit a different page.
- Extend the analyze() function for Lighthouse scores, Core Web Vitals, or custom regex patterns.
- Instead of only calling print_summary(), save results to JSON for later analysis (see the sketch below).

Search engines and AI content generators (like Google’s AI Overviews or NotebookLM) rely heavily on structured metadata and clean HTML to generate summaries and previews. By running this audit, you ensure your page is ready for both.
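Here is a minimal sketch of the JSON idea (the filename audit_report.json is just an assumption); every value returned by analyze() is already JSON-serializable:

# save_report.py -- sketch: store the audit result as JSON instead of printing it
import json
from simple_auditor import analyze, url_to_audit

report = analyze(url_to_audit)
with open("audit_report.json", "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2, ensure_ascii=False)
print("Saved audit_report.json")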
This simple_auditor.py script gives you a fast, self-contained way to check a page’s SEO and AI-readiness without depending on third-party APIs or rate limits. By fixing the missing elements it highlights, you improve your content’s visibility for both search engines and AI tools.
You can copy the full code above, save it as simple_auditor.py, and run:
python simple_auditor.py
Then update url_to_audit for different pages.
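If you audit several pages regularly, a small wrapper loop (a sketch; the second URL is a placeholder) saves you from editing the script each time:

# batch_audit.py -- sketch: run the auditor over a list of pages in one go
from simple_auditor import analyze, print_summary

pages = [
    "https://www.plus2net.com/python/pandas.php",
    "https://example.com/another-page",  # placeholder: replace with your own URLs
]
for page in pages:
    print_summary(analyze(page))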
Want to take your BeautifulSoup auditing to the next level? Check out our Python SQLite Webpage Auditor that stores audit data in a database, allows deeper analysis, and exports reports to Excel for powerful SEO insights.