Skip to content

Commit 0a42684

Browse files
WIP scrape.py to parse event pages... This is tiring :)
1 parent 2c9987f commit 0a42684

File tree

1 file changed

+66 −7 lines changed (66 additions, 7 deletions)

1 file changed

+66
-7
lines changed

migrate/oldwiki/scrape.py

Lines changed: 66 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -103,17 +103,76 @@ def convert_page_to_yaml(page_url: str, name: str, source: str) -> str:
103103
event_description = convert_to_markdown(str(p))
104104
break
105105
if not event_description:
106-
raise ValueError(f"Could not find a valid description for {name}.")
106+
raise ValueError(f"Could not find a valid description for {name}")
107+
108+
event_source = None
109+
source_header = content_div.find("span", id="Source")
110+
if source_header:
111+
source_paragraph = source_header.find_next("p")
112+
if source_paragraph:
113+
source_text = source_paragraph.get_text().strip()
114+
if source_text:
115+
# Remove new lines from the source text
116+
source_text = source_text.replace("\n", " ")
117+
event_source = {
118+
"type": "element",
119+
"description": source_text
120+
}
121+
if not event_source:
122+
raise ValueError(f"Could not find a valid source for {name}")
123+
124+
# Event parameters are optional, there may be none
125+
event_parameters = []
126+
# <h2><span class="mw-headline" id="Parameters">Parameters</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/wiki/OnElementDataChange?action=edit&amp;section=1" title="Edit section: Parameters">edit</a><span class="mw-editsection-bracket">]</span></span></h2>
127+
# <pre class="prettyprint lang-lua">string theKey, var oldValue, var newValue
128+
# </pre>
129+
# <ul><li><b>theKey</b>: The name of the element data entry that has changed.</li>
130+
# <li><b>oldValue</b>: The old value of this entry before it changed. See <a href="/wiki/Element_data" title="Element data">element data</a> for a list of possible datatypes.</li>
131+
# <li><b>newValue</b>: the new value of this entry after it changed. This will be equivalent to <a href="/wiki/GetElementData" title="GetElementData">getElementData</a>(source, theKey).</li></ul>
132+
parameters_header = content_div.find("span", id="Parameters")
133+
if parameters_header:
134+
params = []
135+
# Find the next <pre> tag after the parameters header
136+
pre_tag = parameters_header.find_next("pre")
137+
if pre_tag:
138+
# Extract the text from the <pre> tag
139+
pre_text = pre_tag.get_text().strip()
140+
# Split the text by commas to get individual parameters
141+
param_lines = pre_text.split(",")
142+
for line in param_lines:
143+
line = line.strip()
144+
if line:
145+
# Split by space to get type and name
146+
parts = line.split(" ", 1)
147+
if len(parts) == 2:
148+
param_type, param_name = parts
149+
params.append({
150+
"name": param_name.strip(),
151+
"type": param_type.strip(),
152+
"description": "TODO" # Placeholder for now
153+
})
154+
# Get the parameters descriptions
155+
params_list = parameters_header.find_next("ul")
156+
if params_list:
157+
for li in params_list.find_all("li"):
158+
b_tag = li.find("b")
159+
if b_tag:
160+
param_name = b_tag.text.strip()
161+
for param in params:
162+
if param["name"] == param_name:
163+
# Split by : to get the description
164+
description = li.get_text().split(":", 1)
165+
if len(description) > 1:
166+
param["description"] = description[1].strip()
167+
event_parameters = params
107168

108169
yaml_dict = {
109170
"incomplete": True,
110171
"name": name,
111172
"type": "client" if "Client" in source else "server",
112-
"source_element": {
113-
"type": "element",
114-
"description": "TODO"
115-
},
116-
"description": event_description.strip()
173+
"source_element": event_source,
174+
"description": event_description.strip(),
175+
"parameters": event_parameters
117176
}
118177
yaml_content = yaml.safe_dump(yaml_dict,
119178
sort_keys=False,
@@ -157,7 +216,7 @@ def write_yaml_per_entry(base_dir, data_by_source):
157216
file_content += convert_page_to_yaml(page_url, name, source)
158217
f.write(file_content)
159218
except Exception as e:
160-
print(f"Error processing {name} from {page_url}: {e}")
219+
print(e)
161220
# Cancel and continue to next entry, closing/deleting file if needed
162221
if os.path.exists(filename):
163222
os.remove(filename)

0 commit comments

Comments
 (0)