Webscraper & HTML Text Comparison Tool
One struggle I’ve had in decentralized documentation environments is knowing when someone else updates documentation that is maintained outside my purview.
In one instance at work, several separate GitHub Pages sites were being maintained outside my purview, but I needed to know when updates were made on these sites so I could reconcile them with my documentation. Unfortunately, the other sites’ maintainers wanted to keep their documentation efforts independent of mine, leaving me with a need to know when those other sites were updated. I also couldn’t rely on the site maintainers to let me know when they made updates; people, it turns out, are forgetful.
The only solution to this problem was automation: a tool that would automatically check for site updates on a defined cadence and let me know what changes, if any, were present.
Below is my effort. I wrote this in Python over the course of a day or so. It’s best suited for Mac and Linux machines, but it can be updated with minimal effort for Windows systems. Using `cron`, I set this to execute automatically every day at 9:00 am.
If you want to use this tool, simply update the URLs and labels in the Python dictionary. I’ve added functions whereby you can indicate at run time whether you want to check other websites or use a different benchmarks folder. These functions are currently turned off; to turn them on, flip the `True` arguments passed to them in `__main__` to `False`.
import os
from pathlib import Path
from bs4 import BeautifulSoup
import urllib3
import certifi
import difflib
from datetime import datetime
import webbrowser
import sys
import platform
# Run with: python webScraper.py
# Constants
# Update before running code
# Directory that holds the benchmark HTML files and the generated
# comparison files.
HTML_BENCHMARKS_PATH = Path('/Users/benjaminmoran/Desktop/html-benchmarks/')
# Maps a short site label to the URL to fetch. The label is also used as
# the stem of that site's benchmark file name.
URL_DICT = {
"cornell": "https://it.cornell.edu/web-hosting-static/how-test-your-static-sites",
"wikipedia": "https://en.wikipedia.org",
"portfolio_git": "https://benbarksdale.netlify.app/docs/guides/introduction-to-git-for-technical-writers/",
"github_sphinx_test":"https://github.com/redsoxfan0219/sphinx-github-action"
}
# Other global variables
# Shared connection pool; certificates are required and validated against
# the CA bundle shipped with certifi.
http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
# Date at import time (YYYY-MM-DD), embedded in comparison file names.
date = datetime.now().strftime("%Y-%m-%d")
def get_operating_system():
    """
    Return a coarse label for the host operating system.

    Returns:
        "windows" when sys.platform starts with "win", "mac" when it is
        "darwin", and "linux" for every other platform.
    """
    # The branches must be mutually exclusive. With two independent ``if``
    # statements, the ``else`` attached to the "darwin" check overwrote
    # "windows" with "linux" on Windows hosts.
    if sys.platform.startswith("win"):
        return "windows"
    if sys.platform == "darwin":
        return "mac"
    return "linux"
# Platform label returned by get_operating_system(), resolved once at
# import time.
operating_system = get_operating_system()
# Site objects built from URL_DICT; appended to by _create_site_list().
site_list = []
## Define Site class
class Site:
    """
    Creates a site object to house name, url, and methods to get
    html benchmark and current site html.

    Instantiating a Site performs I/O: the URL is fetched once, and the
    benchmark file is read (or created from the fetched HTML when it does
    not exist yet).

    Attributes:
        name: Short label; also the stem of the benchmark file name.
        url: Address fetched to obtain the current HTML.
        file_path: Benchmark file path computed at construction time.
        present_site_html: Prettified HTML fetched from ``url``, or None
            when the fetch failed.
        html_benchmark: Contents of the benchmark file, or None when no
            benchmark could be read or created.
        comparison_file: Path of the diff report. Only set once a report
            has been written during this run.
        comparison_created_already_today: Only set (to True) once a report
            dated today is known to exist; callers test it with hasattr().
    """

    # Explicit encoding so benchmarks round-trip identically regardless of
    # the platform's default locale encoding.
    _ENCODING = 'utf-8'

    def __init__(self, name: str, url: str):
        self.name = name
        self.url = url
        self.file_path = self.set_file_path()
        # Fetch the live HTML first so a benchmark created during this run
        # is written from the same snapshot. Fetching twice could report a
        # spurious difference for pages whose markup changes per request.
        self.present_site_html = self.get_present_site_html()
        self.html_benchmark = self.get_html_benchmark()

    def _benchmark_file_path(self) -> Path:
        """
        Returns the benchmark file path for the site's current name.
        """
        return HTML_BENCHMARKS_PATH / f'{self.name}.html'

    def _comparison_file_path(self) -> Path:
        """
        Returns the path of today's comparison file for this site.
        """
        return HTML_BENCHMARKS_PATH.joinpath(
            f"{self.name}_benchmark_present-comparison-{date}.html"
        )

    def get_url(self):
        """
        Getter for site's url attribute.
        """
        return self.url

    def set_url(self, url):
        """
        Setter for site's url attribute.
        """
        self.url = url

    def get_name(self):
        """
        Getter for site's name attribute.
        """
        return self.name

    def set_name(self, name: str):
        """
        Setter for site's name attribute.
        """
        self.name = name

    def set_file_path(self):
        """
        Computes and returns the benchmark file path for this site.
        Despite the name, it does not assign the attribute itself; the
        constructor stores the returned value in file_path.
        """
        return self._benchmark_file_path()

    def get_file_path(self):
        """
        Getter for site's file_path attribute.
        """
        return self.file_path

    def get_present_site_html(self):
        """
        GETs site's HTML from web.
        Returns the prettified HTML as a string, or None when the request
        fails or the response status is not 200.
        """
        try:
            response = http.request('GET', self.url)
        except urllib3.exceptions.HTTPError as error:
            # Connection, TLS and retry failures are reported like a bad
            # status so one unreachable site does not abort the whole run.
            print(f"Failed to retrieve HTML for {self.name}. Error: {error}")
            return None
        if response.status != 200:
            print(f"Failed to retrieve HTML for {self.name}. Status code: {response.status}")
            return None
        return BeautifulSoup(response.data, 'html.parser').prettify()

    def get_html_benchmark(self):
        """
        Retrieves contents of HTML benchmark file.
        When no benchmark file exists, a new one is created from the
        site's current HTML.
        Returns the benchmark text, or None when it could not be read or
        created.
        """
        benchmark_file_path = self._benchmark_file_path()
        try:
            if benchmark_file_path.exists():
                with open(benchmark_file_path, 'r', encoding=self._ENCODING) as f:
                    return f.read()
            ColoredOutput.print_blue(f"Creating a new HTML benchmark for {self.name}")
            print("")
            self.create_new_html_benchmark()
            # create_new_html_benchmark only sets the attribute on success.
            return getattr(self, 'html_benchmark', None)
        except (OSError, UnicodeError) as error:
            print(f"Could not load HTML benchmark for {self.name}: {error}")
            return None

    def create_new_html_benchmark(self):
        """
        Creates a new HTML benchmark file and assigns file's contents to
        html_benchmark attribute.
        Does nothing when a benchmark file already exists or when no HTML
        could be retrieved.
        """
        benchmark_file_path = self._benchmark_file_path()
        if benchmark_file_path.exists():
            return
        # Reuse the HTML fetched for this run when there is one.
        file_contents = getattr(self, 'present_site_html', None)
        if file_contents is None:
            file_contents = self.get_present_site_html()
        if file_contents is None:
            # Writing nothing is better than leaving an empty benchmark
            # that every later run would compare against.
            print(f"Skipping benchmark creation for {self.name}: no HTML could be retrieved.")
            return
        with open(benchmark_file_path, 'w', encoding=self._ENCODING) as f:
            f.write(file_contents)
        self.html_benchmark = file_contents

    def replace_existing_benchmark(self):
        """
        Replaces existing benchmark file with one based on
        site's current HTML.
        Intended for execution at end of program.
        The existing benchmark is kept when no current HTML is available.
        """
        file_contents = getattr(self, 'present_site_html', None)
        if file_contents is None:
            # Opening with 'w' truncates the file, so bail out before
            # touching it when there is nothing to write.
            print(f"Keeping existing benchmark for {self.name}: no current HTML available.")
            return
        benchmark_file_path = self._benchmark_file_path()
        if benchmark_file_path.exists():
            with open(benchmark_file_path, 'w', encoding=self._ENCODING) as f:
                f.write(file_contents)
            self.html_benchmark = file_contents

    def produce_differences_file_benchmark_and_present_html(self):
        """
        Creates an output HTML file comparing benchmark and current HTML.
        Sets comparison_file attribute equal to path of comparison file.
        Does nothing when either side of the comparison is missing.
        """
        if self.html_benchmark is None or self.present_site_html is None:
            print(f"Cannot compare {self.name}: benchmark or current HTML is missing.")
            return
        benchmark_split = self.html_benchmark.splitlines()
        present_html_split = self.present_site_html.splitlines()
        differ = difflib.HtmlDiff(wrapcolumn=70)
        table_of_diffs = differ.make_file(benchmark_split, present_html_split)
        target_file = self._comparison_file_path()
        # make_file() declares utf-8 in the generated page by default, so
        # the file is written with the same encoding.
        with open(target_file, 'w', encoding=self._ENCODING) as f:
            f.write(table_of_diffs)
        self.comparison_file = target_file
        self.comparison_created_already_today = True

    def open_comparison_file_with_browser(self):
        """
        Opens comparison file if one exists.
        """
        if not hasattr(self, 'comparison_file'):
            return
        try:
            # as_uri() builds a percent-encoded file:// URL that is valid
            # on every platform, including Windows drive-letter paths.
            webbrowser.open_new_tab(Path(self.comparison_file).resolve().as_uri())
        except Exception as e:
            print(f"An error occurred: {e}")

    def check_for_sameday_comparison(self):
        """
        Sets attribute if comparison file has been generated already today.
        """
        if self._comparison_file_path().exists():
            self.comparison_created_already_today = True
class ColoredOutput:
    """
    Helpers that wrap text in ANSI color codes before printing it to
    standard output.
    """
    # ANSI escape sequences for the supported colors
    RED = "\033[91m"
    BLUE = "\033[94m"
    RESET = "\033[0m"  # Restores the terminal's default color

    @staticmethod
    def print_red(text):
        """
        Write text to standard output in red.
        """
        colored = "".join((ColoredOutput.RED, text, ColoredOutput.RESET))
        print(colored)

    @staticmethod
    def print_blue(text):
        """
        Write text to standard output in blue.
        """
        colored = "".join((ColoredOutput.BLUE, text, ColoredOutput.RESET))
        print(colored)
# Alternatively, you can use platform.system()
# NOTE(review): os_name is not referenced anywhere else in the visible code.
os_name = platform.system()
def _use_different_sites(use_defaults=False):
"""
Provides user chance to build their own URL_DICT
Can silence in future runs by setting use_defaults=True
"""
global URL_DICT
if URL_DICT:
ColoredOutput().print_red("\nA dictionary of sites has already been defined in webScraper.py.\n")
user_input = input("Use the present dictionary? ")
if user_input.lower() in ["no", "n"]:
add_another_entry = ""
while add_another_entry.lower() not in ["no", "n"]:
user_defined_urls_dict = {}
site_shortname = input("Enter a shortname for the site (not URL): ")
site_url = input("Enter URL: ")
user_defined_urls_dict[site_shortname] = site_url
add_another_entry = input("Add another site to list? ")
URL_DICT = user_defined_urls_dict
def _check_user_benchmark_preference(default=False):
    """
    Checks if user wants to change HTML benchmarks path.
    Can be silenced for future runs by setting default=True.

    Args:
        default: When True, keep HTML_BENCHMARKS_PATH and do not prompt.

    Side effects:
        Ensures the chosen benchmarks directory exists. Rebinds the
        module-level HTML_BENCHMARKS_PATH when the user supplies a new
        absolute path ("html-benchmarks" is appended to it).
    """
    global HTML_BENCHMARKS_PATH
    if default:
        # The default location still has to exist before any benchmark is
        # written, even when the prompt is skipped.
        _create_benchmarks_directory()
        return
    print(f"Default path for html benchmarks is: {HTML_BENCHMARKS_PATH}\n")
    choice = input("Use a different path for HTML benchmarks folder? ")
    if choice.lower() not in ("yes", "y"):
        _create_benchmarks_directory()
        return
    # Keep asking until a usable absolute path has been provided.
    while True:
        user_entry = input("Enter an absolute path: ")
        user_provided_dir = Path(user_entry)
        if not user_provided_dir.is_absolute():
            print("Invalid path provided.")
            continue
        new_path = user_provided_dir / "html-benchmarks"
        if new_path.is_dir():
            HTML_BENCHMARKS_PATH = new_path
            print(f"{new_path} set as the path for benchmarks")
            return
        try:
            # Raises FileExistsError (an OSError) when the target exists
            # but is not a directory, which sends the user back to the prompt.
            new_path.mkdir(parents=True)
        except OSError as e:
            print(f"Error creating directory: {e}")
            continue
        print(f"Created directory at {new_path}")
        HTML_BENCHMARKS_PATH = new_path
        return
def _create_benchmarks_directory():
    """
    Creates benchmarks directory if it doesn't already exist.

    Missing parent directories are created as well.

    Raises:
        OSError: When the directory cannot be created, for example
            because the path exists but is not a directory.
    """
    # parents=True covers a missing parent folder, which os.mkdir cannot
    # handle; exist_ok=True removes the check-then-create race.
    HTML_BENCHMARKS_PATH.mkdir(parents=True, exist_ok=True)
def _create_site_list():
    """
    Creates list of Site objects based on the key-value
    pairs listed in the URL_DICT

    A site whose construction fails is reported and skipped; the
    remaining sites are still processed.
    """
    for name, url in URL_DICT.items():
        # The try sits inside the loop so that one failing site does not
        # drop every site listed after it.
        try:
            site = Site(name, url)
        except Exception as e:
            print(f'Error for {name}: {e}')
            continue
        site_list.append(site)
def _update_benchmarks():
    """
    Asks every site in site_list to replace its existing benchmark,
    printing a status line before and after.
    """
    print("Updating benchmarks...")
    for tracked_site in site_list:
        tracked_site.replace_existing_benchmark()
    print("Benchmarks updated.")
def _compare_benchmark_and_present_html():
    """
    If there is a difference between the benchmark and the present HTML,
    offers to produce a comparison of the two.
    Checks if a comparison file has already been produced.
    User can recreate daily comparison file.
    If no benchmark exists, creates a new benchmark.
    Sites whose current HTML could not be retrieved are skipped.
    Afterwards, offers to open every comparison file written in this run.
    """
    affirmative = ("yes", "y")
    for site in site_list:
        if site.html_benchmark is None:
            print("")
            ColoredOutput.print_blue(f"\nNo existing benchmark for {site.get_name()}.")
            ColoredOutput.print_blue("Creating new benchmark file for future comparisons.\n")
            site.create_new_html_benchmark()
            continue
        if site.present_site_html is None:
            # A failed fetch is not a content change; reporting it as a
            # difference would end in a crash while building the diff.
            ColoredOutput.print_red(f"\nWARNING: Could not retrieve current HTML for {site.get_name()}; skipping comparison")
            continue
        if site.html_benchmark == site.present_site_html:
            print("")
            ColoredOutput.print_blue(f"No difference between {site.get_name()}'s benchmark and present site detected")
            print("")
            continue
        ColoredOutput.print_red(f"\nWARNING: Difference between {site.get_name()}'s benchmark and present site")
        user_option = input("\nCreate an HTML table of the deltas? ")
        if user_option.lower() not in affirmative:
            continue
        site.check_for_sameday_comparison()
        if hasattr(site, "comparison_created_already_today"):
            rerun_comparison = input(f"\nA comparison file has already been created today for {site.name}. Replace comparison file? ")
            if rerun_comparison.lower() not in affirmative:
                continue
        site.produce_differences_file_benchmark_and_present_html()
    # Look at every site, not just the last loop variable, and test for the
    # attribute that is only set once a comparison file was written.
    if any(hasattr(site, 'comparison_file') for site in site_list):
        wants_to_open_comparison_files = input("Want to open the file(s) outlining the deltas? ")
        if wants_to_open_comparison_files.lower() in affirmative:
            for site in site_list:
                site.open_comparison_file_with_browser()
if __name__ == "__main__":
    # Both interactive prompts are silenced so the script can run
    # unattended; set the arguments to False to be asked at run time.
    # _use_different_sites takes use_defaults, not default; passing the
    # wrong keyword raised a TypeError before anything else ran.
    _use_different_sites(use_defaults=True)
    _check_user_benchmark_preference(default=True)
    _create_site_list()
    _compare_benchmark_and_present_html()
    _update_benchmarks()