Skip to content

Commit 0330c33

Browse files
committed
feat: add full history page import/export
1 parent a17f34c commit 0330c33

File tree

3 files changed

+249
-0
lines changed

3 files changed

+249
-0
lines changed

scripts/migration/legacy_articles

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# exports and re-imports a page with its full history
2+
3+
import os
4+
5+
from osw.auth import CredentialManager
6+
from osw.core import OSW
7+
from osw.wtsite import WtPage, WtSite
8+
9+
# use credentials from file. if none are found, the user will be prompted to enter them
# (the file is looked up in <repo root>/examples, three directory levels above this script)
repo_root = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)
cm = CredentialManager(
    cred_filepath=os.path.join(repo_root, "examples", "accounts.pwd.yaml")
)

# create the site object
# wtsite = WtSite(WtSite.WtSiteConfig(iri="http://<your-instance>:18081", cred_mngr=cm))
wtsite = WtSite(
    WtSite.WtSiteConfig(
        # iri="stacktest.digital.isc.fraunhofer.de", cred_mngr=cm
        iri="wiki-dev.open-semantic-lab.org",
        cred_mngr=cm,
    )
)
osw = OSW(site=wtsite)

# export the page
p = osw.site.get_page(WtSite.GetPageParam(titles=["Main_Page"])).pages[0]
res = p.export_xml()
if not res.success:
    # without a valid export there is nothing meaningful to import
    raise SystemExit("Export of 'Main_Page' failed")

# optionally store the export on disk:
# with open("Main_Page.xml", "w", encoding="utf8") as f:
#     f.write(res.xml)

# re-import the page (typically in a different instance)
p2 = osw.site.get_page(WtSite.GetPageParam(titles=["Item:TestImport"])).pages[0]
res2 = p2.import_xml(
    WtPage.ImportConfig(
        xml=res.xml,
        summary="test import",
        source_domain="wiki-dev.open-semantic-lab.org",
        # usernames of the source instance mapped to usernames of the target instance
        username_mapping={
            "TestUser": "New User",
        },
    )
)
if res2.success:
    print(f"Imported '{res2.imported_title}' with {res2.imported_revisions} revisions")
else:
    print(f"Import failed: {res2.error_msg}")

src/osw/wtsite.py

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55
import json
66
import os
77
import shutil
8+
import xml.etree.ElementTree as et
89
from datetime import datetime
10+
from io import StringIO
911
from pathlib import Path
1012
from pprint import pprint
1113
from time import sleep
@@ -1056,6 +1058,171 @@ def get_file_info_and_usage(
10561058
def purge(self):
10571059
self._page.purge()
10581060

1061+
class ExportConfig(model.OswBaseModel):
    """Options controlling how a page is exported to XML (see export_xml)"""

    full_history: Optional[bool] = True
    """export every revision of the page if true, only the current revision if false"""
    include_templates: Optional[bool] = False
    """additionally export the templates used in the page if true"""
1068+
1069+
class ExportResult(model.OswBaseModel):
    """Outcome of export_xml"""

    xml: str
    """the exported XML document as string (empty if the export failed)"""
    success: bool
    """true if the export request succeeded, else false"""
1076+
1077+
def export_xml(self, config: Optional[ExportConfig] = None) -> ExportResult:
    """Exports the page to XML by posting to Special:Export of the wiki

    Parameters
    ----------
    config, optional
        see ExportConfig. If omitted (or None), the defaults of ExportConfig
        are used.

    Returns
    -------
    ExportResult
        success is False and xml is empty if the wiki did not answer
        with HTTP status 200
    """
    if config is None:
        # create the default per call instead of sharing one instance
        # as default argument between all calls
        config = WtPage.ExportConfig()

    site = self.wtSite._site
    url = (
        site.scheme
        + "://"
        + site.host
        + site.path
        + "index.php?title=Special:Export/"
        + self.title
    )
    data = {
        "title": "Special:Export",
        "catname": "",
        "pages": self.title,
        "wpEditToken": site.get_token("csrf"),
        "wpDownload": "1",
    }
    # both options are form checkboxes: they are only sent when enabled
    if not config.full_history:
        data["curonly"] = "1"
    if config.include_templates:
        data["templates"] = "1"

    response = site.connection.post(url, data=data)
    if response.status_code != 200:
        return WtPage.ExportResult(success=False, xml="")
    return WtPage.ExportResult(success=True, xml=response.text)
1115+
1116+
class ImportConfig(model.OswBaseModel):
    """Configuration to import a page from XML.
    see also https://www.mediawiki.org/wiki/Manual:Importing_XML_dumps"""

    xml: str
    """the XML string to import (see WtPage.export_xml)"""
    summary: str
    """the edit summary to use for the import"""
    source_domain: str
    """the domain of the instance from which the XML was exported, e.g. mywiki.com"""
    full_history: Optional[bool] = True
    """if true, import the full history of the page, else only the current revision"""
    include_templates: Optional[bool] = False
    """if true, import the templates used in the page if contained in the XML"""
    # the values are numeric namespace IDs, hence Dict[str, int]
    namespace_mapping: Optional[Dict[str, int]] = {
        "Main": 0,
        "File": 6,
        "Template": 10,
        "Category": 14,
        "Item": 7000,
    }
    """mapping of namespaces names to IDs in the target instance"""
    username_mapping: Optional[Dict[str, str]] = {}
    """mapping of usernames in the XML to usernames in the target instance"""
1140+
1141+
class ImportResult(model.OswBaseModel):
    """Return type of import_xml"""

    success: bool
    """if true, the import was successful, else false"""
    # the following two fields have defaults so that a failed import,
    # which has neither a title nor revisions to report, can be represented
    imported_title: Optional[str] = None
    """the title of the imported page as reported by the wiki"""
    imported_revisions: int = 0
    """the number of imported revisions as reported by the wiki"""
    error_msg: Optional[str] = None
    """the error message reported by the wiki if the import failed"""
1149+
1150+
def import_xml(self, config: ImportConfig) -> ImportResult:
    """Imports the page from an XML export

    Before uploading, the first <title> and <ns> element of the XML are
    rewritten to the title and namespace of this page and the usernames
    are translated according to config.username_mapping.
    The passed config object is not modified.

    Parameters
    ----------
    config
        see ImportConfig

    Returns
    -------
    ImportResult
        success is False and error_msg is set if the XML could not be
        processed or the wiki reported an error
    """

    def failure(message: str) -> "WtPage.ImportResult":
        # nothing was imported: report the requested target title and 0 revisions
        return WtPage.ImportResult(
            success=False,
            imported_title=self.title,
            imported_revisions=0,
            error_msg=message,
        )

    # hide the default namespace definition while parsing, otherwise ElementTree
    # qualifies every tag with it (see
    # https://stackoverflow.com/questions/34009992/python-elementtree-default-namespace)
    ns_decl = 'xmlns="http://www.mediawiki.org'
    hidden_ns_decl = "_" + ns_decl
    try:
        tree = et.fromstring(config.xml.replace(ns_decl, hidden_ns_decl))
    except et.ParseError as e:
        return failure("Invalid XML: " + str(e))

    title_element = tree.find(".//title")
    ns_element = tree.find(".//ns")
    if title_element is None or ns_element is None:
        return failure("XML does not contain a page with <title> and <ns>")

    # split only at the first colon: the page name itself may contain colons
    # and pages of the main namespace have no namespace prefix at all
    namespace, separator, page_name = self.title.partition(":")
    if not separator:
        namespace, page_name = "Main", self.title
    namespace_mapping = config.namespace_mapping or {}

    # replace title and namespace with the requested ones
    title_element.text = page_name
    ns_element.text = str(namespace_mapping.get(namespace, 0))

    # apply username mapping (user in the target system might have different names)
    username_mapping = config.username_mapping or {}
    for username_element in tree.findall(".//username"):
        username_element.text = username_mapping.get(
            username_element.text, username_element.text
        )

    # serialize and restore the default namespace definition
    xml = et.tostring(tree, encoding="unicode").replace(hidden_ns_decl, ns_decl)

    site = self.wtSite._site
    api_url = site.scheme + "://" + site.host + site.path + "api.php"
    data = {
        "action": "import",
        "token": site.get_token("csrf"),
        "assignknownusers": "1",
        "interwikiprefix": config.source_domain,
        # "namespace": self.title.split(":")[0],
        "summary": config.summary,
        "format": "json",
    }
    # boolean API parameters count as true whenever they are present,
    # regardless of their value, so they are only sent when enabled
    if config.full_history:
        data["fullhistory"] = "1"
    if config.include_templates:
        data["templates"] = "1"

    response = site.connection.post(
        url=api_url,
        data=data,
        # upload the XML string as if it were a file
        files={"xml": ("xml", StringIO(xml), "text/xml")},
    )

    result = response.json()
    if "error" in result:
        return failure(result["error"]["info"])

    imported = result["import"][0]
    return WtPage.ImportResult(
        success=True,
        imported_title=imported["title"],
        imported_revisions=imported["revisions"],
    )
1225+
10591226

10601227
# Updating forwards refs in pydantic models
10611228
WtSite.UploadPageParam.update_forward_refs()
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import uuid
2+
3+
from osw.auth import CredentialManager
4+
from osw.core import OSW
5+
from osw.wtsite import WtPage, WtSite
6+
7+
# run with: tox -e test -- --wiki_domain domain --wiki_username user --wiki_password pass
8+
9+
10+
def _test_ontology_import(wiki_domain, wiki_username, wiki_password):
    """Exports 'Main_Page' with its history and re-imports it under a new,
    random title in the 'Item' namespace of the same wiki.

    this test does not run with a bot account
    (the leading underscore keeps it from being collected by pytest)
    """
    cm = CredentialManager()
    cm.add_credential(
        CredentialManager.UserPwdCredential(
            iri=wiki_domain, username=wiki_username, password=wiki_password
        )
    )
    wtsite = WtSite(WtSite.WtSiteConfig(iri=wiki_domain, cred_mngr=cm))

    osw = OSW(site=wtsite)

    p = osw.site.get_page(WtSite.GetPageParam(titles=["Main_Page"])).pages[0]
    res = p.export_xml()
    assert res.success is True

    # random target title to avoid collisions with existing pages
    p2 = osw.site.get_page(
        WtSite.GetPageParam(titles=["Item:" + OSW.get_osw_id(uuid.uuid4())])
    ).pages[0]
    try:
        res2 = p2.import_xml(
            WtPage.ImportConfig(
                xml=res.xml,
                summary="test import",
                # the XML was exported from the wiki under test
                source_domain=wiki_domain,
            )
        )
        assert res2.success is True
        assert res2.imported_title == p2.title
        assert res2.imported_revisions > 0
    finally:
        # clean up even if one of the assertions above fails
        p2.delete()

0 commit comments

Comments
 (0)