This script scans specified directories for PHP files, extracts metadata, and updates both an Excel file and a SQLite database (with an optional MySQL connection) with the collected information. Additionally, it generates an updated sitemap.xml file for search engine optimization.
This Python script scans PHP files inside multiple website directories. It extracts metadata, saves the info into an Excel file and a SQLite database, and generates a sitemap.xml file for each section. It is ideal for webmasters automating large site audits or managing sitemaps.
🔹 Import Required Libraries and Setup
This section imports necessary Python modules and establishes a database connection.
import os
from datetime import datetime
import pandas as pd
from sqlalchemy import create_engine, Float
# SQLite connection (MySQL option commented)
my_conn = create_engine("sqlite:///F:\\testing2\\sitemap\\data\\plus2net.db")
my_conn = my_conn.connect()
🔹 Set Directory List and Paths
This block defines directories to scan and the local paths for reading PHP files and saving Excel output.
l1 = ['python']
path1 = "C:\\xampp\\htdocs\\z-plus2net\\"
f_x1 = 'F:\\testing2\\sitemap\\data\\'
🔹 Loop Through Files and Collect Metadata
This core section reads all `.php` files, checks for tags like canonical, breadcrumb, etc., and collects size and last modified date.
for d in l1:
path = path1 + d + "\\"
f_x = f_x1 + d + '.xlsx'
files = os.listdir(path)
df = pd.DataFrame(columns=[...])
for f in files:
f_name, f_extension = os.path.splitext(path + f)
if f_extension == '.php':
size = os.path.getsize(path + f)
t_stamp = os.path.getmtime(path + f)
dt_mod = datetime.fromtimestamp(t_stamp)
m_date = datetime.strftime(dt_mod, '%Y-%m-%d')
🔹 Check for Tags in HTML
This logic reads each file and checks for presence of specific SEO-related tags like `<head>`, `canonical`, and `breadcrumb`.
fob = open(path + f, 'r', encoding='utf8', errors='ignore')
data = fob.read()
tag_head = 0 if data.find('</head>') < 0 else 1
🔹 Save Data to Excel and Database
All metadata is saved to an Excel file and also stored into a SQLite database table using SQLAlchemy.
df.to_excel(f_x, index=False)
df.to_sql(con=my_conn, name=d, if_exists='replace', index=False, ...)
🔹 Generate Sitemap.xml from Excel
Reads the Excel again, applies filters (like `tag_head!=0`), and generates an XML-compliant sitemap file.
for d in l1:
df = pd.read_excel(f_x)
df = df.loc[df['tag_head'] != 0]
my_str = '<?xml version="1.0" encoding="UTF-8"?>\n'
my_str += '<urlset xmlns="https://www.sitemaps.org/schemas/sitemap/0.9">\n'
def print_row(row):
global my_str
my_str += f"<url><loc>{row['url']}</loc><lastmod>{row['dt']}</lastmod></url>\n"
df.apply(print_row, axis=1)
my_str += "</urlset>"
with open(path + "sitemap.xml", 'w') as file:
file.write(my_str)
## Update the database table and the Excel page list, then build a
## sitemap.xml for each directory listed in l1.
import os
from datetime import datetime

import pandas as pd
from sqlalchemy import create_engine, Float

# my_conn = create_engine("mysql+mysqldb://id:pw@localhost/my_tutorial")  # MySQL option
my_conn = create_engine("sqlite:///F:\\testing2\\sitemap\\data\\plus2net.db")
my_conn = my_conn.connect()  # explicit connection (add this line if to_sql errors)

# Source directory list: each entry is a sub-directory to scan.
# Fix: a comma was missing after 'articles', which silently merged two
# entries into the single bogus directory 'articlesasp-tutorial'.
l1 = ['javascript_tutorial', 'php_tutorial', 'html_tutorial',
      'sql_tutorial', 'python', 'jquery', 'c-tutorial', 'articles',
      'asp-tutorial', 'ps-tutorial', 'java_tutorial', 'angular', 'msg-demo']
# l1 = ['c-tutorial']  # uncomment to update a single directory
l1 = ['python']  # comment this line to update all directories above
# path1 = "C:\\xampp\\htdocs\\plus2net\\"
path1 = "C:\\xampp\\htdocs\\z-plus2net\\"  # full path to parent directory
f_x1 = 'F:\\testing2\\sitemap\\data\\'     # Excel file path to store output
###### End of Edit #############

# -------- Part 1: scan .php files, collect metadata, save Excel + DB ------
for d in l1:
    path = path1 + d + "\\"   # full path to this directory
    f_x = f_x1 + d + '.xlsx'  # Excel file named after the directory
    files = os.listdir(path)  # list of files in the directory
    df = pd.DataFrame(columns=['url', 'f_name', 'dt', 'size',
                               'tag_nav', 'Sp_Mobile', 'Sp_Desktop'])
    df['dt'] = pd.to_datetime(df['dt'])  # make dt a date column
    # astype returns a new frame; assign it back (the original discarded it)
    df = df.astype({'Sp_Mobile': 'float64', 'Sp_Desktop': 'float64'})
    for f in files:
        f_name, f_extension = os.path.splitext(path + f)
        if f_extension == '.php':  # only consider .php pages
            size = os.path.getsize(path + f)
            t_stamp = os.path.getmtime(path + f)  # file modification time
            # t_stamp = os.path.getctime(path + f)  # file creation time
            dt_mod = datetime.fromtimestamp(t_stamp)
            m_date = datetime.strftime(dt_mod, '%Y-%m-%d')  # YYYY-MM-DD
            # with-statement closes the file (the original leaked the handle)
            with open(path + f, 'r', encoding='utf8', errors='ignore') as fob:
                data = fob.read()
            # 1 when the tag/marker is present, 0 otherwise
            tag_head = 1 if '</head>' in data else 0
            tag_canonical = 1 if 'canonical' in data else 0
            tag_nav = 1 if 'breadcrumb' in data else 0
            url = "https://www.plus2net.com/" + d + "/" + f  # site + dir + file
            df2 = pd.DataFrame({'url': url, 'f_name': [f], 'dt': [m_date],
                                'size': [size], 'todo': 1,
                                'tag_head': tag_head,
                                'tag_canonical': tag_canonical,
                                'tag_nav': tag_nav,
                                'Sp_Mobile': 0, 'Sp_Desktop': 0})
            df = pd.concat([df, df2])  # append one row per page
    df.to_excel(f_x, index=False)  # Excel file with page metadata
    df.to_sql(con=my_conn, name=d, if_exists='replace', index=False,
              dtype={"Sp_Mobile": Float, "Sp_Desktop": Float})  # DB table per dir

# -------- Part 2: create sitemap.xml from the Excel data ------------------
for d in l1:
    # Fix: recompute path and f_x per directory — the original reused the
    # stale values from the last iteration of Part 1, so multi-directory
    # runs read the wrong Excel file and wrote to a corrupted path.
    path = path1 + d + "\\"
    f_x = f_x1 + d + '.xlsx'
    df = pd.read_excel(f_x)
    # condition = (df['tag_canonical'] < 3) & (df['tag_head'] == 1)
    condition = (df['tag_head'] != 0)  # exclude pages without </head>
    df = df.loc[condition]
    # Build the XML with a list + join instead of repeated string +=.
    # NOTE(review): the official namespace is http://www.sitemaps.org/... —
    # some validators may reject https; kept as-is to preserve output.
    parts = ['<?xml version="1.0" encoding="UTF-8"?>\n',
             '<urlset xmlns="https://www.sitemaps.org/schemas/sitemap/0.9">\n']
    for row in df.itertuples():
        parts.append(f"<url><loc>{row.url}</loc><lastmod>{row.dt}</lastmod></url>\n")
    parts.append("</urlset>")
    with open(path + "sitemap.xml", 'w') as file:
        file.write("".join(parts))
    print("For ", d, " Total : ", len(df))
Excluding pages from the sitemap (noindex-style tagging)
Some pages should be excluded from the sitemap, but not added to the noindex
list. A common example is a "Thank You" page displayed after a user submits a form.
Another example is the result page of a demo that's part of a larger tutorial. While these pages are important for user interaction, they are not meant to be accessed directly or independently by search engines.
To handle this, I’ve implemented a simple tagging system. By adding a custom tag (such as <!-- plus2net_no_sitemap_include -->
) near the bottom of such pages, we can clearly mark them for exclusion.
My sitemap generator script reads this tag and automatically skips those pages during XML generation. Additionally, the script records this tag status in a dedicated column of the Excel report for better tracking and control.
This script scans specified folders for `.php` files and extracts metadata such as last modified date, canonical tags, breadcrumbs, and more. It stores this data in both a SQLite database and an Excel file, and it generates a sitemap.xml file that excludes any page marked with the special tag <!-- plus2net_no_sitemap_include -->.
## Update the database table and the Excel page list, then build a
## sitemap.xml per directory — skipping pages that carry the
## <!-- plus2net_no_sitemap_include --> exclusion tag.
import os
from datetime import datetime

import pandas as pd
from sqlalchemy import create_engine, Float

my_conn = create_engine("sqlite:///E:\\testing3\\sitemap\\data\\plus2net.db")
my_conn = my_conn.connect()  # explicit connection (add this line if to_sql errors)

l1 = ['php_tutorial']                    # directories to process
path1 = "C:\\xampp\\htdocs\\plus2net\\"  # parent of the source directories
f_x1 = 'E:\\testing3\\sitemap\\data\\'   # where Excel output is stored

for d in l1:
    path = path1 + d + "\\"   # directory to scan
    f_x = f_x1 + d + '.xlsx'  # Excel file named after the directory
    files = os.listdir(path)
    df = pd.DataFrame(columns=['url', 'f_name', 'dt', 'size',
                               'tag_nav', 'Sp_Mobile', 'Sp_Desktop'])
    df['dt'] = pd.to_datetime(df['dt'])  # make dt a date column
    # astype returns a new frame; assign it back (the original discarded it)
    df = df.astype({'Sp_Mobile': 'float64', 'Sp_Desktop': 'float64'})
    for f in files:
        f_name, f_extension = os.path.splitext(path + f)
        if f_extension == '.php':  # only consider .php pages
            size = os.path.getsize(path + f)
            t_stamp = os.path.getmtime(path + f)  # file modification time
            dt_mod = datetime.fromtimestamp(t_stamp)
            m_date = datetime.strftime(dt_mod, '%Y-%m-%d')  # YYYY-MM-DD
            # with-statement closes the file (the original leaked the handle)
            with open(path + f, 'r', encoding='utf8', errors='ignore') as fob:
                data = fob.read()
            # 1 when the tag/marker is present, 0 otherwise
            tag_head = 1 if '</head>' in data else 0
            tag_canonical = 1 if 'canonical' in data else 0
            tag_nav = 1 if 'breadcrumb' in data else 0
            # tag_sitemap == 0 marks the page for EXCLUSION from the sitemap
            tag_sitemap = 0 if 'plus2net_no_sitemap_include' in data else 1
            url = "https://www.plus2net.com/" + d + "/" + f  # site + dir + file
            df2 = pd.DataFrame({'url': url, 'f_name': [f], 'dt': [m_date],
                                'size': [size], 'todo': 1,
                                'tag_head': tag_head,
                                'tag_canonical': tag_canonical,
                                'tag_nav': tag_nav, 'tag_sitemap': tag_sitemap,
                                'Sp_Mobile': 0, 'Sp_Desktop': 0})
            df = pd.concat([df, df2])  # append one row per page
    df.to_excel(f_x, index=False)  # Excel file with page metadata
    df.to_sql(con=my_conn, name=d, if_exists='replace', index=False,
              dtype={"Sp_Mobile": Float, "Sp_Desktop": Float})  # DB table per dir

    # Part 2: build sitemap.xml from the Excel data just written.
    df = pd.read_excel(f_x)
    # keep pages that have a </head> AND are not tagged for exclusion
    condition = (df['tag_head'] != 0) & (df['tag_sitemap'] != 0)
    df = df.loc[condition]
    # Build the XML with a list + join instead of repeated string +=.
    # NOTE(review): the official namespace is http://www.sitemaps.org/... —
    # some validators may reject https; kept as-is to preserve output.
    parts = ['<?xml version="1.0" encoding="UTF-8"?>\n',
             '<urlset xmlns="https://www.sitemaps.org/schemas/sitemap/0.9">\n']
    for row in df.itertuples():
        parts.append(f"<url><loc>{row.url}</loc><lastmod>{row.dt}</lastmod></url>\n")
    parts.append("</urlset>")
    # write beside the scanned pages, without mutating `path` itself
    sitemap_path = path + "sitemap.xml"
    with open(sitemap_path, 'w') as file:
        file.write(parts and "".join(parts))
    print("For ", d, " Total : ", len(df))
Author
🎥 Join me live on YouTubePassionate about coding and teaching, I publish practical tutorials on PHP, Python, JavaScript, SQL, and web development. My goal is to make learning simple, engaging, and project‑oriented with real examples and source code.