"""
Scraping the Web, Step 1: Get the HTML from a web page.
Summary:
1. Install the requests library.
2. Then the code to get the HTML from a web page is just:
import requests
url = "http://whatever_URL_you_want"
response = requests.get(url)
html = response.text
See:
towardsdatascience.com/how-to-web-scrape-with-python-in-4-minutes-bc49186a8460
for a good, simple tutorial on Scraping the Web.
The examples herein are taken from that site.
Authors: David Mutchler and his colleagues
at Rose-Hulman Institute of Technology.
"""
import requests # You will need to install: requests
import bs4
from bs4 import BeautifulSoup # You will need to install: beautifulsoup4
def main():
    """
    Demonstrate getting the HTML of a web page and searching it two ways.

    Downloads an example page, prints its HTML, and then:
      1. Prints the first LINE of the raw HTML that contains a target string.
      2. Parses the HTML with BeautifulSoup and prints the first <a> tag
         whose TEXT contains the target string, along with its HREF.

    Raises:
        requests.RequestException: if the request fails, times out,
            or the server answers with an HTTP error status.
    """
    url = "http://web.mta.info/developers/turnstile.html"  # An example site
    target = "October 08, 2011"  # The string that both searches look for

    # Get the HTML of a page.  The timeout keeps the program from waiting
    # forever on an unresponsive server; raise_for_status stops us from
    # processing an error page as if it were the page we asked for.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    html = response.text
    print(html)  # Just so that you can see what you are working with.

    # Process the HTML.  Lots of ways to do so.  Here are some examples.

    # Find a LINE of the HTML that contains a specific string:
    lines = html.split("\n")
    matching_line = next((line for line in lines if target in line), None)
    if matching_line is None:
        print("No line contained the string")
    else:
        print("This line contained the string:")
        print(matching_line)

    # Find a TAG with an ATTRIBUTE of the HTML
    # that contains a specific string, as follows:

    # Parse HTML and save to BeautifulSoup object
    soup = BeautifulSoup(html, "html.parser")

    # Look through the a-tags, examining the TEXT of each:
    a_tags = soup.find_all("a")
    matching_tag = next((tag for tag in a_tags if target in tag.text), None)
    if matching_tag is None:
        print("No a-tag had TEXT that contained the string")
    else:
        print("This a-tag contained the string:")
        print(matching_tag)
        print("Its HREF is:")
        # An a-tag is not required to have an href attribute, so use get()
        # (which returns None when it is missing) instead of indexing.
        href = matching_tag.get("href")
        if href is None:
            print("(this a-tag has no href attribute)")
        else:
            print(href)
# Run the demonstration.  Because this call is at module level, it runs
# whenever this file is executed (and also if the file is imported).
main()