-
Notifications
You must be signed in to change notification settings - Fork 35
Python package docs
You are currently reading the waybackpy docs for using it as a Python library. If you want to use waybackpy as a CLI tool, visit our CLI docs.
-
An instance of the Url class (<class 'waybackpy.wrapper.Url'>) is created when you use waybackpy.Url(url, user_agent)
-
url and user_agent must be strings.
- Archiving/Saving a webpage
- Get archive of webpage
- Get the text of archive or the webpage
- Count total number of archives for a webpage
- List of URLs that Wayback Machine knows and has archived for a domain name
- Cdx
import waybackpy
url = "https://en.wikipedia.org/wiki/Multivariable_calculus"
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
wayback = waybackpy.Url(url, user_agent) # <class 'waybackpy.wrapper.Url'>
archive = wayback.save() # <class 'waybackpy.wrapper.Url'>
archive_url = archive.archive_url # <class 'str'>
timestamp = archive.timestamp # <class 'datetime.datetime'>
print(archive_url)
print(timestamp)
https://web.archive.org/web/20201016171808/https://en.wikipedia.org/wiki/Multivariable_calculus
2021-01-05 12:12:33.264205
Try this out in your browser @ https://repl.it/@akamhy/WaybackPySaveExample
- Sometimes the Wayback Machine may deny your archiving request and not save the webpage. waybackpy raises WaybackError if the request fails.
url = "https://github.com/akamhy/waybackpy/this-page-doesn't-exit" # This webpage doesn't exist (404), therefore can't archive.
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
wayback = waybackpy.Url(url, user_agent)
archive = wayback.save()
WaybackError: No archive URL found in the API response. If 'https://github.com/akamhy/waybackpy/this
-page-doesn't-exit' can be accessed via your web browser then either this version of waybackpy (2.3.3)
is out of date or WayBack Machine is malfunctioning. Visit 'https://github.com/akamhy/waybackpy' for
the latest version of waybackpy.
Header:
{'Server': 'nginx/1.15.8', 'Date': 'Sun, 03 Jan 2021 09:41:47 GMT', 'Content-Type': 'text/html; charset=utf-8', 'Transfer-Encoding':
'chunked', 'Connection': 'keep-alive', 'Cache-Control': 'no-cache', 'X-App-Server': 'wwwb-app14', 'X-ts': '523', 'X-RL': '0', 'X-Page-
Cache': 'MISS', 'X-Archive-Screenname': '0'}
- You may handle it (WaybackError) using a try/except block.
import waybackpy
from waybackpy.exceptions import WaybackError
url = "https://github.com/akamhy/waybackpy/this-page-doesn't-exit"
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
wayback = waybackpy.Url(url, user_agent)
try:
    archive = wayback.save()
except WaybackError as e:
    pass # handle as you like!
import waybackpy
url = "https://www.google.com/"
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0"
wayback = waybackpy.Url(url, user_agent) # <class 'waybackpy.wrapper.Url'>
archive = wayback.oldest() # <class 'waybackpy.wrapper.Url'>
archive_url = archive.archive_url # <class 'str'>
timestamp = archive.timestamp # <class 'datetime.datetime'>
json_res = archive.JSON # <class 'dict'>
days_old = len(archive) # <class 'int'>
print(archive_url)
print(timestamp)
print(json_res)
print(days_old)
https://web.archive.org/web/19981111184551/http://google.com:80/
1998-11-11 18:45:51
{'archived_snapshots': {'closest': {'timestamp': '19981111184551', 'available': True, 'status': '200', 'url': 'http://web.archive.org/web/19981111184551/http://google.com:80/'}}, 'timestamp': '199401051310', 'url': 'https://www.google.com/'}
8090
Try this out in your browser @ https://repl.it/@akamhy/WaybackPyOldestExample
import waybackpy
url = "https://www.facebook.com/"
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0"
wayback = waybackpy.Url(url, user_agent) # <class 'waybackpy.wrapper.Url'>
archive = wayback.newest() # <class 'waybackpy.wrapper.Url'>
archive_url = archive.archive_url # <class 'str'>
timestamp = archive.timestamp # <class 'datetime.datetime'>
json_res = archive.JSON # <class 'dict'>
days_old = len(archive) # <class 'int'>
print(archive_url)
print(timestamp)
print(json_res)
print(days_old)
https://web.archive.org/web/20210105130110/https://www.facebook.com/
2021-01-05 13:01:10
{'timestamp': '202101051313', 'url': 'https://www.facebook.com/', 'archived_snapshots': {'closest': {'timestamp': '20210105130110', 'available': True, 'status': '200', 'url': 'http://web.archive.org/web/20210105130110/https://www.facebook.com/'}}}
0
Try this out in your browser @ https://repl.it/@akamhy/WaybackPyNewestExample
from waybackpy import Url
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:38.0) Gecko/20100101 Firefox/38.0"
url = "https://github.com/"
waybackpy_url_obj = Url(url, user_agent)
# Do not pad (don't use leading zeros in the month, year, day, minute, and hour arguments). e.g. For January, set month = 1 and not month = 01.
github_archive_near_2010 = waybackpy_url_obj.near(year=2010)
print(github_archive_near_2010)
https://web.archive.org/web/20101018053604/http://github.com:80/
github_archive_near_2011_may = waybackpy_url_obj.near(year=2011, month=5)
print(github_archive_near_2011_may)
https://web.archive.org/web/20110518233639/https://github.com/
github_archive_near_2015_january_26 = waybackpy_url_obj.near(year=2015, month=1, day=26)
print(github_archive_near_2015_january_26)
https://web.archive.org/web/20150125102636/https://github.com/
github_archive_near_2018_4_july_9_2_am = waybackpy_url_obj.near(year=2018, month=7, day=4, hour=9, minute=2)
print(github_archive_near_2018_4_july_9_2_am)
https://web.archive.org/web/20180704090245/https://github.com/
The package doesn't support the seconds argument yet. You are encouraged to create a PR ;)
Try this out in your browser @ https://repl.it/@akamhy/WaybackPyNearExample
import waybackpy
google_url = "https://www.google.com/"
User_Agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"
waybackpy_url_object = waybackpy.Url(google_url, User_Agent)
# If no argument is passed in get(), it gets the source of the Url used to create the object.
current_google_url_source = waybackpy_url_object.get()
print(current_google_url_source)
# The following chunk of code will force a new archive of google.com and get the source of the archived page.
# waybackpy_url_object.save() type is string.
google_newest_archive_source = waybackpy_url_object.get(waybackpy_url_object.save())
print(google_newest_archive_source)
# waybackpy_url_object.oldest() type is str, it's oldest archive of google.com
google_oldest_archive_source = waybackpy_url_object.get(waybackpy_url_object.oldest())
print(google_oldest_archive_source)
Try this out in your browser @ https://repl.it/@akamhy/WaybackPyGetExample#main.py
import waybackpy
URL = "https://en.wikipedia.org/wiki/Python (programming language)"
UA = "Mozilla/5.0 (iPad; CPU OS 8_1_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B435 Safari/600.1.4"
waybackpy_url_object = waybackpy.Url(url=URL, user_agent=UA)
archive_count = waybackpy_url_object.total_archives()
print(archive_count) # total_archives() returns an int
2516
Try this out in your browser @ https://repl.it/@akamhy/WaybackPyTotalArchivesExample
- If alive=True is set, waybackpy will check all URLs to identify the alive URLs. Don't use with popular websites like google or it would take too long.
- To include URLs from subdomains, set subdomain=True
import waybackpy
URL = "akamhy.github.io"
UA = "Mozilla/5.0 (iPad; CPU OS 8_1_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B435 Safari/600.1.4"
waybackpy_url_object = waybackpy.Url(url=URL, user_agent=UA)
known_urls = waybackpy_url_object.known_urls(alive=True, subdomain=False) # alive and subdomain are optional.
print(known_urls) # known_urls() returns list of URLs
['http://akamhy.github.io',
'https://akamhy.github.io/waybackpy/',
'https://akamhy.github.io/waybackpy/assets/css/style.css?v=a418a4e4641a1dbaad8f3bfbf293fad21a75ff11',
'https://akamhy.github.io/waybackpy/assets/css/style.css?v=f881705d00bf47b5bf0c58808efe29eecba2226c']
Try this out in your browser @ https://repl.it/@akamhy/WaybackPyKnownURLsToWayBackMachineExample#main.py
The docs for Cdx are not complete yet, but the following code may be useful.
from waybackpy import Cdx # waybackpy >= 2.3.3
url = "https://github.com/akamhy/*"
user_agent = "Your-apps-user-agent"
cdx = Cdx(url=url, user_agent=user_agent)
snapshots = cdx.snapshots()
for snapshot in snapshots:
    print(snapshot.archive_url)