feat: add full history page import/export

simontaurus · simontaurus · commit 0330c33153c7 · 2024-01-06T19:31:07.000+01:00
diff --git a/scripts/migration/legacy_articles b/scripts/migration/legacy_articles
@@ -0,0 +1,42 @@
+# exports and re-imports a page with its full history
+
+import os
+
+from osw.auth import CredentialManager
+from osw.core import OSW
+from osw.wtsite import WtPage, WtSite
+
+# use credentials from file. if none are found, the user will be prompted to enter them
+# this script is located two directory levels below the repository root
+repo_root = os.path.dirname(
+    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+)
+cm = CredentialManager(
+    cred_filepath=os.path.join(repo_root, "examples", "accounts.pwd.yaml")
+)
+
+# create the site object
+# wtsite = WtSite(WtSite.WtSiteConfig(iri="http://<your-instance>:18081", cred_mngr=cm))
+wtsite = WtSite(
+    WtSite.WtSiteConfig(
+        # iri="stacktest.digital.isc.fraunhofer.de", cred_mngr=cm
+        iri="wiki-dev.open-semantic-lab.org", cred_mngr=cm
+    )
+)
+osw = OSW(site=wtsite)
+
+# export the page
+p = osw.site.get_page(WtSite.GetPageParam(titles=["Main_Page"])).pages[0]
+res = p.export_xml()
+if not res.success:
+    # a failed export carries an empty XML string, there is nothing to import
+    raise RuntimeError("export of page 'Main_Page' failed")
+
+# optionally store the exported XML in a file
+# with open("Main_Page.xml", "w", encoding="utf8") as f:
+#     f.write(res.xml)
+
+# re-import the page (typically in a different instance)
+p2 = osw.site.get_page(WtSite.GetPageParam(titles=["Item:TestImport"])).pages[0]
+import_res = p2.import_xml(
+    WtPage.ImportConfig(
+        xml=res.xml,
+        summary="test import",
+        source_domain="wiki-dev.open-semantic-lab.org",
+        username_mapping={
+            "TestUser": "New User",
+        },
+    )
+)
+if not import_res.success:
+    # do not let a rejected import pass silently
+    raise RuntimeError(f"import failed: {import_res.error_msg}")
diff --git a/src/osw/wtsite.py b/src/osw/wtsite.py
@@ -5,7 +5,9 @@
 import json
 import os
 import shutil
+import xml.etree.ElementTree as et
 from datetime import datetime
+from io import StringIO
 from pathlib import Path
 from pprint import pprint
 from time import sleep
@@ -1056,6 +1058,171 @@ def get_file_info_and_usage(
     def purge(self):
         self._page.purge()
 
+    class ExportConfig(model.OswBaseModel):
+        """Configuration to export a page to XML
+
+        Evaluated by export_xml: full_history=False adds the form field "curonly",
+        include_templates=True adds the form field "templates" to the export request.
+        """
+
+        full_history: Optional[bool] = True
+        """if true, export the full history of the page, else only the current revision"""
+        include_templates: Optional[bool] = False
+        """if true, export the templates used in the page"""
+
+    class ExportResult(model.OswBaseModel):
+        """Return type of export_xml
+
+        export_xml returns success=False together with an empty xml string if the
+        export request was not answered with HTTP status 200, otherwise
+        success=True and the response text as xml.
+        """
+
+        xml: str
+        """the XML string"""
+        success: bool
+        """if true, the export was successful, else false"""
+
+    def export_xml(self, config: Optional[ExportConfig] = None) -> ExportResult:
+        """Exports the page to XML by posting to the Special:Export page of the site
+
+        Parameters
+        ----------
+        config, optional
+            see ExportConfig. If omitted or None, the defaults of ExportConfig
+            are used.
+
+        Returns
+        -------
+            ExportResult, with success=False and an empty xml string if the
+            request was not answered with HTTP status 200
+        """
+        # build the default per call instead of sharing a single instance created
+        # at function definition time; this also makes an explicit None valid
+        if config is None:
+            config = WtPage.ExportConfig()
+
+        site = self.wtSite._site
+        url = (
+            f"{site.scheme}://{site.host}{site.path}"
+            f"index.php?title=Special:Export/{self.title}"
+        )
+        data = {
+            "title": "Special:Export",
+            "catname": "",
+            "pages": self.title,
+            "wpEditToken": site.get_token("csrf"),
+            "wpDownload": "1",
+        }
+        # both options are only sent when they are switched on
+        if not config.full_history:
+            data["curonly"] = "1"
+        if config.include_templates:
+            data["templates"] = "1"
+
+        response = site.connection.post(url, data=data)
+        if response.status_code != 200:
+            return WtPage.ExportResult(success=False, xml="")
+        return WtPage.ExportResult(success=True, xml=response.text)
+
+    class ImportConfig(model.OswBaseModel):
+        """Configuration to import a page from XML.
+        see also https://www.mediawiki.org/wiki/Manual:Importing_XML_dumps"""
+
+        xml: str
+        """the XML string to import (see WtPage.export_xml)"""
+        summary: str
+        """the edit summary to use for the import"""
+        source_domain: str
+        """the domain of the instance from which the XML was exported, e.g. mywiki.com"""
+        full_history: Optional[bool] = True
+        """if true, import the full history of the page, else only the current revision"""
+        include_templates: Optional[bool] = False
+        """if true, import the templates used in the page if contained in the XML"""
+        # the values are numeric namespace IDs, hence Dict[str, int]
+        namespace_mapping: Optional[Dict[str, int]] = {
+            "Main": 0,
+            "File": 6,
+            "Template": 10,
+            "Category": 14,
+            "Item": 7000,
+        }
+        """mapping of namespaces names to IDs in the target instance"""
+        username_mapping: Optional[Dict[str, str]] = {}
+        """mapping of usernames in the XML to usernames in the target instance"""
+
+    class ImportResult(model.OswBaseModel):
+        """Return type of import_xml"""
+
+        success: bool
+        """if true, the import was successful, else false"""
+        # title and revision count are only known after a successful import, so they
+        # must be optional: a failure result is created without them
+        imported_title: Optional[str] = None
+        """the title of the imported page as reported by the API, None on failure"""
+        imported_revisions: Optional[int] = None
+        """the number of imported revisions as reported by the API, None on failure"""
+        error_msg: Optional[str] = None
+        """the error message reported by the API, None on success"""
+
+    def import_xml(self, config: ImportConfig) -> ImportResult:
+        """Imports the page from an XML export
+
+        The title and namespace found in the XML are replaced with the ones of this
+        page and the usernames are translated with config.username_mapping. The
+        result is uploaded with the MediaWiki API (action=import).
+
+        Parameters
+        ----------
+        config
+            see ImportConfig. The passed object is not modified.
+
+        Returns
+        -------
+            ImportResult
+        """
+        default_ns_decl = 'xmlns="http://www.mediawiki.org'
+        masked_ns_decl = '_xmlns="http://www.mediawiki.org'
+
+        # mask the default namespace definition to address elements by their plain
+        # tag name (see https://stackoverflow.com/questions/34009992/python-elementtree-default-namespace)
+        # work on a local copy, the config of the caller stays untouched
+        tree = et.fromstring(config.xml.replace(default_ns_decl, masked_ns_decl))
+
+        title_element = tree.find(".//title")
+        ns_element = tree.find(".//ns")
+        if title_element is None or ns_element is None:
+            # e.g. the export of a non-existing page contains no <page> element
+            return WtPage.ImportResult(
+                success=False,
+                imported_title="",
+                imported_revisions=0,
+                error_msg="XML contains no page with <title> and <ns> element",
+            )
+
+        # replace title and namespace with the requested ones
+        # split at the first colon only, the page name itself may contain colons
+        prefix, separator, page_name = self.title.partition(":")
+        if separator:
+            # NOTE: prefixes missing in the mapping fall back to namespace 0
+            ns_id = config.namespace_mapping.get(prefix, 0)
+        else:
+            # title without namespace prefix => main namespace
+            page_name = self.title
+            ns_id = 0
+        title_element.text = page_name
+        ns_element.text = str(ns_id)
+
+        # apply username mapping (user in the target system might have different names)
+        for user_element in tree.findall(".//username"):
+            user_element.text = config.username_mapping.get(
+                user_element.text, user_element.text
+            )
+
+        # serialize and restore the default namespace definition
+        xml_payload = et.tostring(tree, encoding="unicode").replace(
+            masked_ns_decl, default_ns_decl
+        )
+
+        site = self.wtSite._site
+        api_url = f"{site.scheme}://{site.host}{site.path}api.php"
+        data = {
+            "action": "import",
+            "token": site.get_token("csrf"),
+            "assignknownusers": "1",
+            "interwikiprefix": config.source_domain,
+            "summary": config.summary,
+            "format": "json",
+        }
+        # boolean API parameters count as true whenever they are present, regardless
+        # of their value, so they have to be omitted to express false
+        if config.full_history:
+            data["fullhistory"] = "1"
+        if config.include_templates:
+            data["templates"] = "1"
+
+        response = site.connection.post(
+            url=api_url,
+            data=data,
+            # hand over the XML string as file upload
+            files={"xml": ("xml", StringIO(xml_payload), "text/xml")},
+        )
+
+        # do not name this "json", that would shadow the json module
+        result = response.json()
+        if "error" in result:
+            # title and revisions are set explicitly so that the result can be
+            # created even if the model declares them as required
+            return WtPage.ImportResult(
+                success=False,
+                imported_title="",
+                imported_revisions=0,
+                error_msg=result["error"]["info"],
+            )
+        imported = result["import"][0]
+        return WtPage.ImportResult(
+            success=True,
+            imported_title=imported["title"],
+            imported_revisions=imported["revisions"],
+        )
+
 
 # Updating forwards refs in pydantic models
 WtSite.UploadPageParam.update_forward_refs()
diff --git a/tests/integration/test_page_export_import.py b/tests/integration/test_page_export_import.py
@@ -0,0 +1,40 @@
+import uuid
+
+from osw.auth import CredentialManager
+from osw.core import OSW
+from osw.wtsite import WtPage, WtSite
+
+# run with: tox -e test -- --wiki_domain domain --wiki_username user --wiki_password pass
+
+
+def _test_ontology_import(wiki_domain, wiki_username, wiki_password):
+    """Exports Main_Page and re-imports it under a random Item title.
+
+    this test does not run with a bot account
+    (the leading underscore keeps it out of the automatic test collection)
+    """
+    cm = CredentialManager()
+    cm.add_credential(
+        CredentialManager.UserPwdCredential(
+            iri=wiki_domain, username=wiki_username, password=wiki_password
+        )
+    )
+    wtsite = WtSite(WtSite.WtSiteConfig(iri=wiki_domain, cred_mngr=cm))
+
+    osw = OSW(site=wtsite)
+
+    p = osw.site.get_page(WtSite.GetPageParam(titles=["Main_Page"])).pages[0]
+    res = p.export_xml()
+    assert res.success is True
+
+    p2 = osw.site.get_page(
+        WtSite.GetPageParam(titles=["Item:" + OSW.get_osw_id(uuid.uuid4())])
+    ).pages[0]
+    res2 = p2.import_xml(
+        WtPage.ImportConfig(
+            xml=res.xml,
+            summary="test import",
+            # the XML was exported from the instance under test
+            source_domain=wiki_domain,
+        )
+    )
+    try:
+        assert res2.success is True
+        assert res2.imported_title == p2.title
+        assert res2.imported_revisions > 0
+    finally:
+        # remove the imported page even if one of the assertions above failed
+        if res2.success:
+            p2.delete()