conda install -c anaconda beautiful-soup
pip install beautifulsoup4
To read XML files
pip install lxml
import requests

# Download the tutorial page; the response object exposes the HTML via .text.
link = "https://www.plus2net.com/html_tutorial/html-canvas.php"
content = requests.get(link)

# Dump the raw HTML source of the page.
print(content.text)
The above code prints the full page (its HTML source).
from bs4 import BeautifulSoup
# Parse the HTML downloaded above (content is the requests response object).
soup = BeautifulSoup(content.text, 'html.parser')
Note that we created soup as our object; we will further use this object to traverse the HTML code and reach its different nodes. Here we have used html.parser. We can also use other options, such as 'lxml' and 'html5lib', for different requirements.
import requests

from bs4 import BeautifulSoup

# Download and parse the canvas tutorial page.
link = "https://www.plus2net.com/html_tutorial/html-canvas.php"
content = requests.get(link)
soup = BeautifulSoup(content.text, 'html.parser')

# Dot access on the soup returns the first matching tag in the document.
print(soup.title)              # the <title> tag
print(soup.h1)                 # the first <h1> tag
print(soup.h1.string)          # text inside that <h1>
#print(soup.title.parent)      # the whole tag that encloses <title>
print(soup.title.parent.name)  # name of the enclosing tag ('head')
Output is here
<title>Canvas html <canvas> tag to draw lines or graphics or animation in web page</canvas></title>
<h1 itemprop="headline"><canvas> HTML Canvas tag</h1>
<canvas> HTML Canvas tag
head
Collecting all <h2> tags
# Collect every <h2> element of the parsed page (soup was created above).
h2_tags=soup.find_all('h2') # List of all h2 tags
# Keep only the text of each heading; .string is None for tags with nested markup.
h2_strings = [tag.string for tag in h2_tags] # remove the h2 tag and keep only string part
We can use a string as the input to create a BeautifulSoup object.
from bs4 import BeautifulSoup

# An HTML document supplied directly as a string instead of a web request.
content = """<html>
<head>
<title>Your title of the page here</title>
<META NAME='DESCRIPTION' CONTENT='my description '>
<META NAME='KEYWORDS' CONTENT='kw1,kw2,kw3'>
</head>
<body>
Hello <br>
Welcome to plus2net.com
</body>
</html>"""

soup = BeautifulSoup(content, 'html.parser')

print(soup.title)              # the <title> tag
print(soup.h1)                 # None: this document has no <h1>
print(soup.title.name)         # name of the tag, i.e. 'title'
print(soup.title.string)       # text inside <title>
print(soup.title.parent)       # the whole enclosing <head> tag
print(soup.title.parent.name)  # name of that parent tag
Output is here
<title>Your title of the page here</title>
None
title
Your title of the page here
<head>
<title>Your title of the page here</title>
<meta content="my description " name="DESCRIPTION"/>
<meta content="kw1,kw2,kw3" name="KEYWORDS"/>
</head>
head
Printing other tags
# soup was created above from the sample HTML string.
print(soup.h2) # first <h2> tag of the document (None if there is no <h2>)
print(soup.h2.string) # text contained inside that <h2> tag
We have two meta tags here. First we will read the content attribute of the first meta tag, and then get the keywords from the second meta tag by using its content attribute.
# soup.meta returns the first <meta> tag; ['content'] reads its content attribute.
print(soup.meta['content']) # first meta tag, attribute content
# find_all returns every <meta> tag; index 1 is the second one (the keywords).
print(soup.find_all('meta')[1]["content"]) # second meta tag
Output
my description
kw1,kw2,kw3
import requests

from bs4 import BeautifulSoup

# Read the title and the keyword meta tag of a YouTube video page.
link = "https://youtu.be/SPw6SfN9beg"
content = requests.get(link)
soup = BeautifulSoup(content.text, 'html.parser')

print(soup.title.string)  # text of the page title

meta_tags = soup.find_all('meta')
print(meta_tags[4]['content'])  # the fifth meta tag carries all keywords
Output
Tkinter frame to hold widgets and images with background colour border and managing layout - YouTube
Tkinter frame to hold multiple widgets, adding image to frame, background colour of the frame, border thickness of frame, frame border colour, border on focus, removing frame using forget(), grid_forget(), pack_forget(), adding multiple frames, delete and restore frames, creating frame using grid and pack layout, Image on label and frame, button on click event remove frame, button on click restore frame
import requests
from bs4 import BeautifulSoup

# Collect the target (href) of every anchor tag on the home page.
link="https://www.plus2net.com"
content = requests.get(link)
soup = BeautifulSoup(content.text, 'html.parser')

my_links = []
# find_all() is the bs4 name; findAll is the deprecated BS3-compat alias.
for ln in soup.find_all('a'):
    my_links.append(ln.get('href'))  # .get returns None if the tag has no href
print(my_links)
Next we read the canonical URLs from the <link> tags of each page.
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine,text

# Compare the canonical <link> of each locally hosted page against the URL
# stored in the database, printing any mismatches.
my_conn = create_engine("mysql+mysqldb://id:pw@localhost/my_tutorial")
my_conn=my_conn.connect() # add this line if error (required on SQLAlchemy 2.x)

table_name='java_tutorial' # or Folder name

# Only pages flagged as having both a canonical tag and a <head> section.
q='SELECT url,f_name from `'+table_name+ '` WHERE tag_canonical=1 AND tag_head=1'
#q=q+ ' AND f_name !="html-head.php" ' ## to remove any additional file if any

my_data=my_conn.execute(text(q)) # SQLAlchemy my_conn result: rows of (url, f_name)
#print(q)
for row in my_data:
    #print(row)
    # Fetch the locally hosted copy of the page.
    link="http://localhost/plus2net/"+table_name+"/"+row[1]
    content = requests.get(link)
    soup = BeautifulSoup(content.text, 'html.parser')
    l1=soup.find_all('link')
    for r in l1:
        if r['rel'][0]=='canonical': # If it is canonical link
            if(r['href']!=row[0]): # if canonical URL is not matching
                print(row[0],r['href'])
                print(r)
This Python script fetches a webpage and extracts all <h2>
tags using the BeautifulSoup library to create a dynamic table of contents in HTML format. The HTML structure includes collapsible content, clickable links for easy navigation, and the result is copied to the clipboard using pyperclip. The script automates the process of generating organized and interactive table of contents from a webpage's headings.
import requests
import pyperclip # to copy to clipboard
from bs4 import BeautifulSoup

# Build a Bootstrap collapsible table of contents from the <h2> headings of a
# page and copy the generated HTML to the clipboard.

# address or URL of the page
link = "http://localhost/plus2net/php_tutorial/sqlite-paging.php"
content = requests.get(link)
soup = BeautifulSoup(content.text, 'html.parser')

# Text of every <h2>; .string is None for headings with nested markup.
h2_tags=soup.find_all('h2')
h2_strings = [tag.string for tag in h2_tags]
#print(h2_strings)

# HTML fragments: collapsible wrapper, list-item open/close, wrapper close.
# NOTE: '\n' restored — the original listing had lost the backslash ('n').
str1="<a class='btn btn-outline-danger' data-toggle='collapse' href='#collapseExample' role='button' aria-expanded='false' aria-controls='collapseExample'>Show Table of Content <span aria-hidden='true'>↓</span> </a>\n<div class='collapse' id='collapseExample'><div class='card card-body'><UL>"
str2="<LI><a href=#>"
str3="</a></LI>"
str4="</UL></div></div><br>"

my_str=str1
for opt in h2_strings:
    if opt is not None:  # skip headings whose text could not be extracted
        my_str= my_str + str2 + opt + str3 + '\n'
my_str=my_str + str4

pyperclip.copy(my_str) # copy the output to clipboard
print(my_str)