Want to quickly check whether a webpage has all the essential SEO and AI-friendly elements, without paid tools or API keys? In this guide, we build and run a Python-based webpage auditor using BeautifulSoup and requests. The script runs locally, requires no authentication or API keys, and returns practical, actionable results.
Many SEO audit tools are paid or have API limits. This simple_auditor.py script:
- Extracts key metadata (title, description, keywords) case-insensitively.
- Reports image alt attribute coverage.
- Checks robots.txt for a sitemap reference.
It also checks the H1, canonical URL, GA4 tracking IDs, Open Graph/Twitter tags, internal vs. external links, schema markup (FAQ, HowTo, VideoObject), and breadcrumbs.

Make sure you have the following installed:
pip install requests beautifulsoup4 lxml
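If you want to confirm the installation before running the auditor, a quick throwaway check like this (optional, not part of the script itself) is enough:

# Optional sanity check: confirm the three packages import cleanly
import requests
import bs4
import lxml

print("requests", requests.__version__)
print("beautifulsoup4", bs4.__version__)
print("lxml", lxml.__version__)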
simple_auditor.py
Here’s the full working script. Change url_to_audit to the webpage you want to check.
# simple_auditor.py
# Minimal, API-key-free webpage auditor with case-insensitive meta extraction.
# Change `url_to_audit` to target another page.

import re
import requests
from urllib.parse import urlparse
from bs4 import BeautifulSoup

# ====== Change this URL to audit another page ======
url_to_audit = "https://www.plus2net.com/python/pandas.php"

UA = "Mozilla/5.0 (compatible; AI-Ready-Auditor/1.1; +https://example.com)"
TIMEOUT = 30


def fetch_html(url: str) -> tuple[int, str]:
    r = requests.get(url, headers={"User-Agent": UA}, timeout=TIMEOUT)
    return r.status_code, r.text


def get_meta_by_name(soup: BeautifulSoup, meta_name: str):
    """
    Case-insensitive fetch of content.
    Matches NAME="DESCRIPTION" as well as name="description".
    """
    tag = soup.find("meta", attrs={"name": re.compile(rf"^{re.escape(meta_name)}$", re.I)})
    return (tag.get("content") or "").strip() if tag and tag.get("content") else None


def get_meta_by_property(soup: BeautifulSoup, prop_name: str):
    """
    Case-insensitive fetch of content (e.g., og:description).
    """
    tag = soup.find("meta", attrs={"property": re.compile(rf"^{re.escape(prop_name)}$", re.I)})
    return (tag.get("content") or "").strip() if tag and tag.get("content") else None


def extract_schema_flags(json_ld_blocks: list[str]):
    """
    Simple presence checks for common schema types used in articles.
    """
    j = "\n".join(json_ld_blocks)
    has_faq = "FAQPage" in j
    has_howto = "HowTo" in j
    has_video = "VideoObject" in j
    return has_faq, has_howto, has_video


def analyze(url: str):
    status, html = fetch_html(url)
    soup = BeautifulSoup(html, "lxml")

    # ---- Metadata (case-insensitive for meta name/property values) ----
    title = soup.title.string.strip() if soup.title and soup.title.string else None

    # Description: name=description (any case) with fallback to og:description
    meta_description = get_meta_by_name(soup, "description") or get_meta_by_property(soup, "og:description")
    meta_description_length = len(meta_description) if meta_description else None

    # Keywords (optional, informational)
    meta_keywords = get_meta_by_name(soup, "keywords")

    # Viewport (sanity check for mobile friendliness)
    meta_viewport = get_meta_by_name(soup, "viewport")

    # ---- Headings ----
    h1_tag = soup.find("h1")
    h1 = h1_tag.get_text(strip=True) if h1_tag else None

    # ---- Canonical (case-insensitive rel) ----
    canonical_tag = soup.find("link", rel=lambda v: v and v.lower() == "canonical")
    canonical_url = canonical_tag.get("href") if canonical_tag else None

    # ---- GA4 (G-XXXXXXXX) ----
    ga4_ids = sorted(set(re.findall(r"G-[A-Z0-9]{6,12}", html)))
    has_ga4 = bool(ga4_ids)

    # ---- Open Graph / Twitter (case-insensitive) ----
    og_tags = {
        t.get("property"): t.get("content")
        for t in soup.find_all("meta", attrs={"property": re.compile(r"^og:", re.I)})
        if t.get("property") and t.get("content")
    }
    twitter_tags = {
        t.get("name"): t.get("content")
        for t in soup.find_all("meta", attrs={"name": re.compile(r"^twitter:", re.I)})
        if t.get("name") and t.get("content")
    }

    # ---- Images / alt coverage ----
    imgs = soup.find_all("img")
    images_total = len(imgs)
    images_with_alt = sum(1 for i in imgs if (i.get("alt") or "").strip())
    images_alt_coverage_pct = round((images_with_alt / images_total * 100), 2) if images_total else 0.0

    # ---- Links (internal vs external) ----
    parsed = urlparse(url)
    base_host = parsed.netloc
    internal_links, external_links = 0, 0
    for a in soup.find_all("a", href=True):
        href = a.get("href")
        p = urlparse(href)
        if not p.netloc or p.netloc == base_host:
            internal_links += 1
        else:
            external_links += 1

    # ---- Schema JSON-LD presence ----
    json_ld_blocks = [t.get_text() for t in soup.find_all("script", type="application/ld+json")]
    has_faq_schema, has_howto_schema, has_video_schema = extract_schema_flags(json_ld_blocks)

    # ---- Breadcrumbs ----
    breadcrumbs_present = bool(soup.select('[itemtype*="BreadcrumbList" i], nav.breadcrumb, ol.breadcrumb'))

    # ---- robots.txt & sitemap hint ----
    root = f"{parsed.scheme}://{parsed.netloc}"
    sitemap_present = False
    try:
        rr = requests.get(f"{root}/robots.txt", headers={"User-Agent": UA}, timeout=10)
        if rr.ok and "sitemap" in rr.text.lower():
            sitemap_present = True
    except Exception:
        pass

    return {
        "url": url,
        "http_status": status,
        "title": title,
        "title_length": len(title) if title else None,
        "meta_description": meta_description,
        "meta_description_length": meta_description_length,
        "meta_keywords": meta_keywords,
        "meta_viewport_present": bool(meta_viewport),
        "h1": h1,
        "canonical_url": canonical_url,
        "ga4_ids": ga4_ids,
        "og_tags_present": bool(og_tags),
        "twitter_tags_present": bool(twitter_tags),
        "images_total": images_total,
        "images_with_alt": images_with_alt,
        "images_alt_coverage_pct": images_alt_coverage_pct,
        "internal_links": internal_links,
        "external_links": external_links,
        "has_faq_schema": has_faq_schema,
        "has_howto_schema": has_howto_schema,
        "has_videoobject_schema": has_video_schema,
        "breadcrumbs_present": breadcrumbs_present,
        "sitemap_present": sitemap_present,
    }


def print_summary(r: dict):
    print("\n=== AI-Ready Webpage Audit Summary ===")
    print("URL:", r["url"])
    print("HTTP Status:", r["http_status"])

    # Title / Meta
    print(f"Title ({r['title_length']} chars):", "Present" if r["title"] else "Missing")
    print(
        f"Meta Description ({r['meta_description_length']} chars):",
        "Present" if r["meta_description"] else "Missing",
    )
    print("Meta Keywords:", "Present" if r.get("meta_keywords") else "Missing")
    print("Viewport Meta:", "Present" if r.get("meta_viewport_present") else "Missing")

    # Structure
    print("H1:", "Present" if r["h1"] else "Missing")
    print("Canonical URL:", r["canonical_url"] or "Missing")

    # Tracking / Social
    print("GA4 IDs:", r["ga4_ids"] if r["ga4_ids"] else "None Found")
    print("OpenGraph Tags:", r["og_tags_present"])
    print("Twitter Tags:", r["twitter_tags_present"])

    # Media / Links
    print(f"Images with alt: {r['images_with_alt']}/{r['images_total']} ({r['images_alt_coverage_pct']}%)")
    print("Internal Links:", r["internal_links"], "| External Links:", r["external_links"])

    # Schema / Nav
    print("FAQ Schema:", r["has_faq_schema"])
    print("HowTo Schema:", r["has_howto_schema"])
    print("VideoObject Schema:", r["has_videoobject_schema"])
    print("Breadcrumbs:", r["breadcrumbs_present"])
    print("Sitemap in robots.txt:", r["sitemap_present"])
    print("======================================\n")


if __name__ == "__main__":
    report = analyze(url_to_audit)
    print_summary(report)
Under the hood, the script fetches the page with the requests library, parses it with BeautifulSoup, and uses compiled regular expressions so that meta tags are matched case-insensitively; a tiny standalone sketch of that lookup follows below.
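Here is the sketch; the HTML string is made up purely for illustration:

# Illustrative only: why the regex-based lookup matters for case-insensitive matching
import re
from bs4 import BeautifulSoup

html = '<head><meta NAME="DESCRIPTION" content="Pandas tutorial"></head>'
soup = BeautifulSoup(html, "lxml")

# Same idea as get_meta_by_name(): anchor the name and ignore case
tag = soup.find("meta", attrs={"name": re.compile(r"^description$", re.I)})
print(tag.get("content"))   # -> Pandas tutorial

# A plain string filter would miss the uppercase attribute value
print(soup.find("meta", attrs={"name": "description"}))   # -> None

Running the full script against the default URL prints a summary like this: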
=== AI-Ready Webpage Audit Summary ===
URL: https://www.plus2net.com/python/pandas.php
HTTP Status: 200
Title (38 chars): Present
Meta Description (45 chars): Present
Meta Keywords: Present
Viewport Meta: Present
H1: Present
Canonical URL: https://www.plus2net.com/python/pandas.php
GA4 IDs: ['G-DXKVCW4XVG']
OpenGraph Tags: True
Twitter Tags: True
Images with alt: 5/5 (100.0%)
Internal Links: 12 | External Links: 3
FAQ Schema: False
HowTo Schema: False
VideoObject Schema: False
Breadcrumbs: True
Sitemap in robots.txt: True
======================================
You can customize the audit in a few ways:
- Change url_to_audit at the top of the script.
- Extend the analyze() function for Lighthouse scores, Core Web Vitals, or custom regex patterns.
- Instead of print_summary(), save the results to JSON for later analysis (a minimal sketch is shown just below).

Search engines and AI content generators (like Google’s AI Overviews or NotebookLM) rely heavily on structured metadata and clean HTML to generate summaries and previews. By running this audit, you ensure your page is ready for both classic search results and AI-generated answers.
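For the JSON idea mentioned in the list above, a minimal sketch could look like this; it reuses analyze() from simple_auditor.py, and the output file name is just an example:

# save_report.py -- minimal sketch: dump the audit dict to a JSON file instead of printing it
import json
from simple_auditor import analyze, url_to_audit

report = analyze(url_to_audit)
with open("audit_report.json", "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2, ensure_ascii=False)

print("Saved audit_report.json")

Because every value in the report dictionary is a plain string, number, boolean, list, or None, it serializes to JSON without any extra handling.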
This simple_auditor.py script gives you a fast, local way to check a page’s SEO and AI-readiness without depending on third-party APIs or rate limits. By fixing the missing elements it highlights, you improve your content’s visibility for both search engines and AI tools.
You can copy the full code above, save it as simple_auditor.py, and run:
python simple_auditor.py
Then update url_to_audit for different pages.
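If you check several pages regularly, one option is to import the functions and loop over a list of URLs; the list below is only an example:

# batch_audit.py -- sketch of auditing several pages in one run (URL list is an example)
from simple_auditor import analyze, print_summary

urls = [
    "https://www.plus2net.com/python/pandas.php",
    "https://www.plus2net.com/php/index.php",   # placeholder; use your own pages
]

for url in urls:
    try:
        print_summary(analyze(url))
    except Exception as exc:
        print(f"Could not audit {url}: {exc}")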
Want to take your BeautifulSoup auditing to the next level? Check out our Python SQLite Webpage Auditor that stores audit data in a database, allows deeper analysis, and exports reports to Excel for powerful SEO insights.
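That tutorial covers the full workflow, but the core idea of persisting each audit can be sketched in a few lines; the table name and columns below are assumptions, not the linked article’s actual schema:

# sqlite_sketch.py -- rough idea only; the linked SQLite auditor uses its own, richer schema
import json
import sqlite3
from simple_auditor import analyze, url_to_audit

conn = sqlite3.connect("audits.db")
conn.execute(
    "CREATE TABLE IF NOT EXISTS audits ("
    "url TEXT, audited_at TEXT DEFAULT CURRENT_TIMESTAMP, report TEXT)"
)

report = analyze(url_to_audit)
conn.execute(
    "INSERT INTO audits (url, report) VALUES (?, ?)",
    (report["url"], json.dumps(report)),
)
conn.commit()
conn.close()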
Author
Passionate about coding and teaching, I publish practical tutorials on PHP, Python, JavaScript, SQL, and web development. My goal is to make learning simple, engaging, and project-oriented with real examples and source code.