
Python package docs


You are currently reading the waybackpy docs for using it as a Python library. If you want to use waybackpy as a CLI tool, visit our CLI docs.


  • An instance of the Url class (<class 'waybackpy.wrapper.Url'>) is created when you call waybackpy.Url(url, user_agent); a minimal sketch is shown below.

  • url and user_agent must be strings.
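
A minimal sketch of creating an instance (assuming waybackpy is installed; the URL and user agent below are placeholders):

import waybackpy

url = "https://example.com"           # any URL you want to work with
user_agent = "My-User-Agent/1.0"      # identify your app to the Wayback Machine

wayback = waybackpy.Url(url, user_agent)
print(type(wayback))  # <class 'waybackpy.wrapper.Url'>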



Archiving or Saving a webpage

import waybackpy

url = "https://en.wikipedia.org/wiki/Multivariable_calculus"
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"

wayback = waybackpy.Url(url, user_agent) # <class 'waybackpy.wrapper.Url'>

archive = wayback.save() # <class 'waybackpy.wrapper.Url'>

archive_url = archive.archive_url # <class 'str'>
timestamp  = archive.timestamp  # <class 'datetime.datetime'>

print(archive_url)
print(timestamp)
Output:

https://web.archive.org/web/20201016171808/https://en.wikipedia.org/wiki/Multivariable_calculus
2021-01-05 12:12:33.264205

Try this out in your browser @ https://repl.it/@akamhy/WaybackPySaveExample

Exception/Error Handling
  • Sometimes the Wayback Machine may deny your archiving request and not save the webpage. waybackpy raises WaybackError if your request fails.
url = "https://github.com/akamhy/waybackpy/this-page-doesn't-exit" # This webpage doesn't exist (404), therefore can't archive.
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
wayback = waybackpy.Url(url, user_agent)
archive = wayback.save()
Output:

WaybackError: No archive URL found in the API response. If 'https://github.com/akamhy/waybackpy/this-page-doesn't-exit' can be accessed via your web browser then either this version of waybackpy (2.3.3) is out of date or WayBack Machine is malfunctioning. Visit 'https://github.com/akamhy/waybackpy' for the latest version of waybackpy.
Header:
{'Server': 'nginx/1.15.8', 'Date': 'Sun, 03 Jan 2021 09:41:47 GMT', 'Content-Type': 'text/html; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Cache-Control': 'no-cache', 'X-App-Server': 'wwwb-app14', 'X-ts': '523', 'X-RL': '0', 'X-Page-Cache': 'MISS', 'X-Archive-Screenname': '0'}
  • You can handle WaybackError with a try/except block (a fallback pattern is also sketched after the example below).
import waybackpy
from waybackpy.exceptions import WaybackError

url = "https://github.com/akamhy/waybackpy/this-page-doesn't-exit"
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"

wayback = waybackpy.Url(url, user_agent)

try:
    archive = wayback.save()
except WaybackError as e:
    pass # handle as you like!
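
One possible pattern (an illustrative sketch, not a library feature) is to fall back to the most recent existing archive, via newest() described below, when a fresh save fails:

import waybackpy
from waybackpy.exceptions import WaybackError

url = "https://example.com"   # placeholder URL
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"

wayback = waybackpy.Url(url, user_agent)

try:
    archive = wayback.save()     # try to create a fresh archive
except WaybackError:
    archive = wayback.newest()   # fall back to the latest existing archive

print(archive.archive_url)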

Get an archive of a webpage

Retrieving the oldest archive for a URL using oldest()
import waybackpy

url = "https://www.google.com/"
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0"

wayback = waybackpy.Url(url, user_agent) # <class 'waybackpy.wrapper.Url'>
archive = wayback.oldest() # <class 'waybackpy.wrapper.Url'>

archive_url = archive.archive_url # <class 'str'>
timestamp = archive.timestamp # <class 'datetime.datetime'>
json_res = archive.JSON # <class 'dict'>
days_old = len(archive) # <class 'int'>


print(archive_url)
print(timestamp)
print(json_res)
print(days_old)
Output:

https://web.archive.org/web/19981111184551/http://google.com:80/
1998-11-11 18:45:51
{'archived_snapshots': {'closest': {'timestamp': '19981111184551', 'available': True, 'status': '200', 'url': 'http://web.archive.org/web/19981111184551/http://google.com:80/'}}, 'timestamp': '199401051310', 'url': 'https://www.google.com/'}
8090

Try this out in your browser @ https://repl.it/@akamhy/WaybackPyOldestExample
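
Since timestamp is a datetime.datetime, the age reported by len(archive) can be cross-checked against it. A rough sanity-check sketch (assuming the timestamp is in UTC, which is not stated above):

import waybackpy
from datetime import datetime

url = "https://www.google.com/"
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0"

archive = waybackpy.Url(url, user_agent).oldest()

# Age of the oldest archive in days, computed from its timestamp;
# this should be close to the value returned by len(archive).
age_in_days = (datetime.utcnow() - archive.timestamp).days
print(age_in_days, len(archive))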

Retrieving the newest archive for a URL using newest()
import waybackpy

url = "https://www.facebook.com/"
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0"

wayback = waybackpy.Url(url, user_agent) # <class 'waybackpy.wrapper.Url'>
archive = wayback.newest() # <class 'waybackpy.wrapper.Url'>

archive_url = archive.archive_url # <class 'str'>
timestamp = archive.timestamp # <class 'datetime.datetime'>
json_res = archive.JSON # <class 'dict'>
days_old = len(archive) # <class 'int'>


print(archive_url)
print(timestamp)
print(json_res)
print(days_old)
Output:

https://web.archive.org/web/20210105130110/https://www.facebook.com/
2021-01-05 13:01:10
{'timestamp': '202101051313', 'url': 'https://www.facebook.com/', 'archived_snapshots': {'closest': {'timestamp': '20210105130110', 'available': True, 'status': '200', 'url': 'http://web.archive.org/web/20210105130110/https://www.facebook.com/'}}}
0

Try this out in your browser @ https://repl.it/@akamhy/WaybackPyNewestExample

Retrieving an archive close to a specified year, month, day, hour, and minute using near()
from waybackpy import Url

url = "https://github.com/"
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:38.0) Gecko/20100101 Firefox/38.0"


wayback = Url(url, user_agent)

archive_near_2010 = wayback.near(year=2010)

archive_url = archive_near_2010.archive_url # <class 'str'>
timestamp = archive_near_2010.timestamp # <class 'datetime.datetime'>
json_res = archive_near_2010.JSON # <class 'dict'>
days_old = len(archive_near_2010) # <class 'int'>

print(archive_url)
print(timestamp)
print(json_res)
print(days_old)


print("\n\n")

# NOTICE: Do not zero-pad the day, hour, minute, etc.; they must be plain integers (<class 'int'>)
archive_near_2018_4th_july_9_2_am = wayback.near(year=2018, month=7, day=4, hour=9, minute=2)

archive_url = archive_near_2018_4th_july_9_2_am.archive_url # <class 'str'>
timestamp = archive_near_2018_4th_july_9_2_am.timestamp # <class 'datetime.datetime'>
json_res = archive_near_2018_4th_july_9_2_am.JSON # <class 'dict'>
days_old = len(archive_near_2018_4th_july_9_2_am) # <class 'int'>

print(archive_url)
print(timestamp)
print(json_res)
print(days_old)
Output:

https://web.archive.org/web/20100215001541/http://en.wikipedia.org:80/wiki/Multivariable_calculus
2010-02-15 00:15:41
{'url': 'https://en.wikipedia.org/wiki/Multivariable_calculus', 'timestamp': '201001051321', 'archived_snapshots': {'closest': {'url': 'http://web.archive.org/web/20100215001541/http://en.wikipedia.org:80/wiki/Multivariable_calculus', 'status': '200', 'timestamp': '20100215001541', 'available': True}}}
3977



https://web.archive.org/web/20180615033754/https://en.wikipedia.org/wiki/Multivariable_calculus
2018-06-15 03:37:54
{'url': 'https://en.wikipedia.org/wiki/Multivariable_calculus', 'timestamp': '201807040902', 'archived_snapshots': {'closest': {'url': 'http://web.archive.org/web/20180615033754/https://en.wikipedia.org/wiki/Multivariable_calculus', 'status': '200', 'timestamp': '20180615033754', 'available': True}}}
935

Try this out in your browser @ https://repl.it/@akamhy/WaybackPyNearExample
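
near() can also be called repeatedly, for example to grab roughly one archive per year. A small sketch (the year range is arbitrary):

from waybackpy import Url

url = "https://en.wikipedia.org/wiki/Multivariable_calculus"
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:38.0) Gecko/20100101 Firefox/38.0"

wayback = Url(url, user_agent)

# Print one archive URL per year, closest to each requested year
for year in range(2010, 2014):
    archive = wayback.near(year=year)
    print(year, archive.archive_url)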


Get the content of a webpage using get()

import waybackpy

url = "https://stackoverflow.com/"

user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"

wayback = waybackpy.Url(url, user_agent)


# If no argument is passed to get(), it fetches the source of the URL used to create the object.
# This will get the source code of https://stackoverflow.com/
current_stackoverflow_url_source = wayback.get()
print(current_stackoverflow_url_source)


# The following 2 lines of code will force a new archive of https://stackoverflow.com/ 
# and get the source code of the archived page.
stackoverflow_newest_archive_source = wayback.get(wayback.save())
print(stackoverflow_newest_archive_source)


# Source code of oldest archive of https://stackoverflow.com/
stackoverflow_oldest_archive_source = wayback.get(wayback.oldest())
print(stackoverflow_oldest_archive_source)

Try this out in your browser @ https://repl.it/@akamhy/WaybackPyGetExample#main.py
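
If you want to keep a local copy, the returned source can be written to a file. A minimal sketch (assuming get() returns the page source as a string; the file name is arbitrary):

import waybackpy

url = "https://stackoverflow.com/"
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"

wayback = waybackpy.Url(url, user_agent)

# Fetch the source of the oldest archive and save it locally
oldest_source = wayback.get(wayback.oldest())
with open("stackoverflow_oldest_archive.html", "w", encoding="utf-8") as f:
    f.write(oldest_source)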


Count total archives for a URL using total_archives()

import waybackpy

URL = "https://en.wikipedia.org/wiki/Python (programming language)"
UA = "Mozilla/5.0 (iPad; CPU OS 8_1_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B435 Safari/600.1.4"

wayback = waybackpy.Url(url=URL, user_agent=UA)

total_archives = wayback.total_archives() # <class 'int'>

print(total_archives)
Output:

2550

Try this out in your browser @ https://repl.it/@akamhy/WaybackPyTotalArchivesExample
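
One way to use the count (an illustrative sketch, not a library feature) is to archive a page only when the Wayback Machine has no snapshot of it yet:

import waybackpy

URL = "https://example.com/some-page"   # placeholder URL
UA = "My-User-Agent/1.0"                # placeholder user agent

wayback = waybackpy.Url(url=URL, user_agent=UA)

count = wayback.total_archives()  # <class 'int'>; 0 means no snapshots exist yet
if count == 0:
    archive = wayback.save()
    print("Archived:", archive.archive_url)
else:
    print("Already archived", count, "times")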


List of URLs that the Wayback Machine knows of and has archived for a domain name

  1. If alive=True is set, waybackpy checks every known URL and returns only those that are still alive. Don't use this with very popular websites (e.g., Google); checking every URL would take far too long.
  2. To include URLs from subdomains, set subdomain=True (a sketch using it follows the example below).
import waybackpy

URL = "akamhy.github.io"
UA = "Mozilla/5.0 (iPad; CPU OS 8_1_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B435 Safari/600.1.4"

wayback = waybackpy.Url(url=URL, user_agent=UA)

known_urls = wayback.known_urls(alive=True, subdomain=False) # <class 'list'>

print(known_urls)
Output:

['http://akamhy.github.io',
'https://akamhy.github.io/waybackpy/',
'https://akamhy.github.io/waybackpy/assets/css/style.css?v=a418a4e4641a1dbaad8f3bfbf293fad21a75ff11',
'https://akamhy.github.io/waybackpy/assets/css/style.css?v=f881705d00bf47b5bf0c58808efe29eecba2226c']

Try this out in your browser @ https://repl.it/@akamhy/WaybackPyKnownURLsToWayBackMachineExample#main.py
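
To also pull in URLs from subdomains, pass subdomain=True. A minimal sketch (alive=False skips the liveness check, so this only lists what the Wayback Machine reports):

import waybackpy

URL = "akamhy.github.io"
UA = "Mozilla/5.0 (iPad; CPU OS 8_1_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B435 Safari/600.1.4"

wayback = waybackpy.Url(url=URL, user_agent=UA)

# Include URLs from subdomains as well; skip checking whether each URL is still alive
known_urls = wayback.known_urls(alive=False, subdomain=True)

print(len(known_urls))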

Cdx

The docs for Cdx are not complete yet, but the following code may be useful.

from waybackpy import Cdx # waybackpy >= 2.3.3
url = "https://github.com/akamhy/*"
user_agent = "Your-apps-user-agent"

cdx = Cdx(url=url, user_agent=user_agent)
snapshots = cdx.snapshots()

for snapshot in snapshots:
    print(snapshot.archive_url)
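
Because snapshots() can yield a large number of results for a wildcard query, here is a hedged sketch that stops after the first few archive URLs (only archive_url is used, since other snapshot attributes are not documented here):

from waybackpy import Cdx  # waybackpy >= 2.3.3

url = "https://github.com/akamhy/waybackpy"
user_agent = "Your-apps-user-agent"

cdx = Cdx(url=url, user_agent=user_agent)

# Collect only the first 10 archive URLs, then stop iterating
archive_urls = []
for snapshot in cdx.snapshots():
    archive_urls.append(snapshot.archive_url)
    if len(archive_urls) == 10:
        break

print(archive_urls)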