-
Notifications
You must be signed in to change notification settings - Fork 35
Python package docs
You are currently reading the waybackpy docs for using it as a Python library. If you want to use waybackpy as a CLI tool, visit our CLI docs.
-
An instance of the Url class (<class 'waybackpy.wrapper.Url'>) is created when you use waybackpy.Url(url, user_agent)
-
url and user_agent must be strings.
- Archiving/Saving a webpage
- Get archive of webpage
- Get the text of archive or the webpage
- Count total number of archives for a webpage
- List of URLs that Wayback Machine knows and has archived for a domain name
- Cdx
import waybackpy
url = "https://en.wikipedia.org/wiki/Multivariable_calculus"
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
wayback = waybackpy.Url(url, user_agent) # <class 'waybackpy.wrapper.Url'>
archive = wayback.save() # <class 'waybackpy.wrapper.Url'>
archive_url = archive.archive_url # <class 'str'>
timestamp = archive.timestamp # <class 'datetime.datetime'>
print(archive_url)
print(timestamp)
https://web.archive.org/web/20201016171808/https://en.wikipedia.org/wiki/Multivariable_calculus
2021-01-05 12:12:33.264205
Try this out in your browser @ https://repl.it/@akamhy/WaybackPySaveExample
- Sometimes the Wayback Machine may deny your archiving request and not save the webpage. waybackpy will raise 'WaybackError' if your request fails.
url = "https://github.com/akamhy/waybackpy/this-page-doesn't-exit" # This webpage doesn't exist (404), therefore can't archive.
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
wayback = waybackpy.Url(url, user_agent)
archive = wayback.save()
WaybackError: No archive URL found in the API response. If 'https://github.com/akamhy/waybackpy/this
-page-doesn't-exit' can be accessed via your web browser then either this version of waybackpy (2.3.3)
is out of date or WayBack Machine is malfunctioning. Visit 'https://github.com/akamhy/waybackpy' for
the latest version of waybackpy.
Header:
{'Server': 'nginx/1.15.8', 'Date': 'Sun, 03 Jan 2021 09:41:47 GMT', 'Content-Type': 'text/html; charset=utf-8', 'Transfer-Encoding':
'chunked', 'Connection': 'keep-alive', 'Cache-Control': 'no-cache', 'X-App-Server': 'wwwb-app14', 'X-ts': '523', 'X-RL': '0', 'X-Page-
Cache': 'MISS', 'X-Archive-Screenname': '0'}
- You may handle it (WaybackError) using a try except block.
import waybackpy
from waybackpy.exceptions import WaybackError
url = "https://github.com/akamhy/waybackpy/this-page-doesn't-exit"
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
wayback = waybackpy.Url(url, user_agent)
try:
archive = wayback.save()
except WaybackError as e:
pass # handle as you like!
import waybackpy
url = "https://www.google.com/"
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0"
wayback = waybackpy.Url(url, user_agent) # <class 'waybackpy.wrapper.Url'>
archive = wayback.oldest() # <class 'waybackpy.wrapper.Url'>
archive_url = archive.archive_url # <class 'str'>
timestamp = archive.timestamp # <class 'datetime.datetime'>
json_res = archive.JSON # <class 'dict'>
days_old = len(archive) # <class 'int'>
print(archive_url)
print(timestamp)
print(json_res)
print(days_old)
https://web.archive.org/web/19981111184551/http://google.com:80/
1998-11-11 18:45:51
{'archived_snapshots': {'closest': {'timestamp': '19981111184551', 'available': True, 'status': '200', 'url': 'http://web.archive.org/web/19981111184551/http://google.com:80/'}}, 'timestamp': '199401051310', 'url': 'https://www.google.com/'}
8090
Try this out in your browser @ https://repl.it/@akamhy/WaybackPyOldestExample
import waybackpy
url = "https://www.facebook.com/"
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0"
wayback = waybackpy.Url(url, user_agent) # <class 'waybackpy.wrapper.Url'>
archive = wayback.newest() # <class 'waybackpy.wrapper.Url'>
archive_url = archive.archive_url # <class 'str'>
timestamp = archive.timestamp # <class 'datetime.datetime'>
json_res = archive.JSON # <class 'dict'>
days_old = len(archive) # <class 'int'>
print(archive_url)
print(timestamp)
print(json_res)
print(days_old)
https://web.archive.org/web/20210105130110/https://www.facebook.com/
2021-01-05 13:01:10
{'timestamp': '202101051313', 'url': 'https://www.facebook.com/', 'archived_snapshots': {'closest': {'timestamp': '20210105130110', 'available': True, 'status': '200', 'url': 'http://web.archive.org/web/20210105130110/https://www.facebook.com/'}}}
0
Try this out in your browser @ https://repl.it/@akamhy/WaybackPyNewestExample
from waybackpy import Url
url = "https://github.com/"
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:38.0) Gecko/20100101 Firefox/38.0"
wayback = Url(url, user_agent)
github_archive_near_2010 = wayback.near(year=2010)
archive_url = github_archive_near_2010.archive_url # <class 'str'>
timestamp = github_archive_near_2010.timestamp # <class 'datetime.datetime'>
json_res = github_archive_near_2010.JSON # <class 'dict'>
days_old = len(github_archive_near_2010) # <class 'int'>
print(archive_url)
print(timestamp)
print(json_res)
print(days_old)
print("\n\n")
# NOTICE : Do not pad the day, hour and minute etc, they must be integers <class 'int'>
github_archive_near_2018_4th_july_9_2_am = wayback.near(year=2018, month=7, day=4, hour=9, minute=2)
archive_url = github_archive_near_2018_4th_july_9_2_am.archive_url # <class 'str'>
timestamp = github_archive_near_2018_4th_july_9_2_am.timestamp # <class 'datetime.datetime'>
json_res = github_archive_near_2018_4th_july_9_2_am.JSON # <class 'dict'>
days_old = len(github_archive_near_2018_4th_july_9_2_am) # <class 'int'>
print(archive_url)
print(timestamp)
print(json_res)
print(days_old)
https://web.archive.org/web/20100215001541/http://en.wikipedia.org:80/wiki/Multivariable_calculus
2010-02-15 00:15:41
{'url': 'https://en.wikipedia.org/wiki/Multivariable_calculus', 'timestamp': '201001051321', 'archived_snapshots': {'closest': {'url': 'http://web.archive.org/web/20100215001541/http://en.wikipedia.org:80/wiki/Multivariable_calculus', 'status': '200', 'timestamp': '20100215001541', 'available': True}}}
3977
https://web.archive.org/web/20180615033754/https://en.wikipedia.org/wiki/Multivariable_calculus
2018-06-15 03:37:54
{'url': 'https://en.wikipedia.org/wiki/Multivariable_calculus', 'timestamp': '201807040902', 'archived_snapshots': {'closest': {'url': 'http://web.archive.org/web/20180615033754/https://en.wikipedia.org/wiki/Multivariable_calculus', 'status': '200', 'timestamp': '20180615033754', 'available': True}}}
935
Try this out in your browser @ https://repl.it/@akamhy/WaybackPyNearExample
import waybackpy
url = "https://stackoverflow.com/"
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"
wayback = waybackpy.Url(url, user_agent)
# If no argument is passed in get(), it gets the source of the Url used to create the object.
# This will get source code of https://stackoverflow.com/
current_stackoverflow_url_source = wayback.get()
print(current_stackoverflow_url_source)
# The following 2 lines of code will force a new archive of https://stackoverflow.com/
# and get the source code of the archived page.
stackoverflow_newest_archive_source = wayback.get(wayback.save())
print(stackoverflow_newest_archive_source)
# Source code of oldest archive of https://stackoverflow.com/
stackoverflow_oldest_archive_source = wayback.get(wayback.oldest())
print(stackoverflow_oldest_archive_source)
Try this out in your browser @ https://repl.it/@akamhy/WaybackPyGetExample#main.py
import waybackpy
URL = "https://en.wikipedia.org/wiki/Python (programming language)"
UA = "Mozilla/5.0 (iPad; CPU OS 8_1_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B435 Safari/600.1.4"
wayback = waybackpy.Url(url=URL, user_agent=UA)
total_archives = wayback.total_archives() # <class 'int'>
print(total_archives)
2550
Try this out in your browser @ https://repl.it/@akamhy/WaybackPyTotalArchivesExample
- If alive=True is set, waybackpy will check all URLs to identify the ones that are still alive. Don't use it with popular websites like Google, or it will take too long.
- To include URLs from subdomains, set subdomain=True.
import waybackpy
URL = "akamhy.github.io"
UA = "Mozilla/5.0 (iPad; CPU OS 8_1_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B435 Safari/600.1.4"
wayback = waybackpy.Url(url=URL, user_agent=UA)
known_urls = wayback.known_urls(alive=True, subdomain=False) # <class 'list'>
print(known_urls)
['http://akamhy.github.io',
'https://akamhy.github.io/waybackpy/',
'https://akamhy.github.io/waybackpy/assets/css/style.css?v=a418a4e4641a1dbaad8f3bfbf293fad21a75ff11',
'https://akamhy.github.io/waybackpy/assets/css/style.css?v=f881705d00bf47b5bf0c58808efe29eecba2226c']
Try this out in your browser @ https://repl.it/@akamhy/WaybackPyKnownURLsToWayBackMachineExample#main.py
The docs for Cdx are not complete yet, but the following code may be useful.
from waybackpy import Cdx # waybackpy >= 2.3.3
url = "https://github.com/akamhy/*"
user_agent = "Your-apps-user-agent"
cdx = Cdx(url=url, user_agent=user_agent)
snapshots = cdx.snapshots()
for snapshot in snapshots:
print(snapshot.archive_url)