Skip to content

Commit 60d8db9

Browse files
author
awu42
committed
Simplified validate_rst_title_capitalization.py to print correctly (pandas-dev#26941)
1 parent d71be41 commit 60d8db9

File tree

1 file changed

+41
-225
lines changed

1 file changed

+41
-225
lines changed

scripts/validate_rst_title_capitalization.py

Lines changed: 41 additions & 225 deletions
Original file line numberDiff line numberDiff line change
@@ -12,67 +12,16 @@
1212
./scripts/validate_rst_title_capitalization.py doc/source/development/contributing.rst
1313
./scripts/validate_rst_title_capitalization.py doc/source/
1414
15-
Files that cannot be validated: (code crashes when validating for some reason)
16-
doc/source/user_guide/io.rst
17-
doc/source/whatsnew/v0.17.1.rst
18-
19-
Reference: doctree elements
20-
http://epydoc.sourceforge.net/docutils/public/docutils.nodes.Element-class.html
21-
2215
"""
2316

2417
import argparse
2518
import sys
26-
from docutils.parsers.rst import Parser
27-
import docutils
28-
from docutils import nodes
2919
import re
3020
import os
3121
from os import walk
3222
from typing import Generator, List
3323

3424

35-
class suppress_stdout_stderr:
36-
'''
37-
Code source:
38-
https://stackoverflow.com/questions/11130156/
39-
40-
A context manager for doing a "deep suppression" of stdout and stderr in
41-
Python, i.e. will suppress all print, even if the print originates in a
42-
compiled C/Fortran sub-function.
43-
This will not suppress raised exceptions, since exceptions are printed
44-
to stderr just before a script exits, and after the context manager has
45-
exited (at least, I think that is why it lets exceptions through).
46-
47-
This code is needed to suppress output from the parser method
48-
because the parser method prints to stdout when encountering Sphinx
49-
references, as it cannot parse those at this moment.
50-
51-
'''
52-
def __init__(self):
53-
self.null_fds = [os.open(os.devnull, os.O_WRONLY) for x in range(2)]
54-
self.save_fds = [os.dup(1), os.dup(2)]
55-
56-
def __enter__(self):
57-
'''
58-
Assign the null pointers to stdout and stderr.
59-
60-
'''
61-
os.dup2(self.null_fds[0], 1)
62-
os.dup2(self.null_fds[1], 2)
63-
64-
def __exit__(self, *_):
65-
'''
66-
Re-assign the real stdout/stderr back to (1) and (2) and close all
67-
file descriptors
68-
69-
'''
70-
os.dup2(self.save_fds[0], 1)
71-
os.dup2(self.save_fds[1], 2)
72-
for fd in self.null_fds + self.save_fds:
73-
os.close(fd)
74-
75-
7625
# Key names that do not follow the capitalization convention
7726
CAPITALIZATION_EXCEPTIONS = {
7827
'pandas', 'Python', 'IPython', 'PyTables', 'Excel', 'JSON',
@@ -88,12 +37,6 @@ def __exit__(self, *_):
8837
# Key: Document Directory, Value: Pair(Bad Title, Line Number)
8938
bad_title_dict = {}
9039

91-
# List of problematic tags that are exceptions to parent rule
92-
list_of_markers = {'emphasis', 'strong', 'reference', 'literal'}
93-
94-
# List of files that, when validated, causes the program to crash
95-
cannot_validate = ['doc/source/user_guide/io.rst', 'doc/source/whatsnew/v0.17.1.rst']
96-
9740
# Error Message:
9841
err_msg = 'Heading capitalization formatted incorrectly. Please correctly capitalize'
9942

@@ -152,118 +95,18 @@ def follow_capitalization_convention(title: str) -> bool:
15295
return True
15396

15497

155-
def find_line_number(node: docutils.nodes) -> int:
98+
def findTitles(rst_file: str) -> Generator[List[str], List[int], None]:
15699
'''
157-
Recursive method that finds the line number in a document for a particular node
158-
in the doctree
100+
Algorithm to identify particular text that should be considered headings in an
101+
RST file
159102
160-
Text nodes usually don't have any value for its "line" instance variable,
161-
so instead, we recursively look through the parent nodes to eventually find the
162-
correct line number, which I determined would be node.line - 1
163-
164-
Parameters
165-
----------
166-
node : docutils.node
167-
Name of the object of the docstring to validate.
168-
169-
Returns
170-
-------
171-
int
172-
The line number of the node
173-
174-
'''
175-
if (node.tagname == 'document'):
176-
return 1
177-
elif (node.line is None):
178-
return find_line_number(node.parent)
179-
else:
180-
return node.line - 1
181-
182-
183-
def parse_RST(rst_file: str) -> docutils.nodes.document:
184-
'''
185-
Method to parse through an rst_file and return a document tree
103+
See <https://thomas-cokelaer.info/tutorials/sphinx/rest_syntax.html> for details
104+
on what qualifies a string as a heading in RST
186105
187106
Parameters
188107
----------
189108
rst_file : str
190-
Directory address of a .rst file as a string
191-
192-
Returns
193-
-------
194-
document : docutils.nodes.document
195-
Root node of the .rst file's document tree
196-
197-
'''
198-
# Initialize rst Parser object
199-
parser = Parser()
200-
201-
# Open and read the .rst file and store the string of data into input
202-
f = open(rst_file, "r")
203-
input = f.read()
204-
205-
# Set up default settings for the document tree
206-
settings = docutils.frontend.OptionParser(
207-
components=(docutils.parsers.rst.Parser,)
208-
).get_default_values()
209-
210-
# Initialize an empty document tree with the default settings from above
211-
document = docutils.utils.new_document('Document', settings)
212-
213-
# Parse input into an RST doctree, suppressing any stderr from parse method
214-
parser.parse(input, document)
215-
216-
# Return the root node of the document tree
217-
return document
218-
219-
220-
def find_titles_in_doctree(document: docutils.nodes.document) -> Generator[
221-
List[str], List[int], None]:
222-
'''
223-
Algorithm to identify particular text nodes as headings
224-
along with the text node's line number.
225-
226-
The idea is that when we traverse through the text nodes, nodes whose
227-
parents have a tagname of 'title' are definitely considered to be part
228-
of headings.
229-
230-
However, the problem occurs when we encounter text that has been either
231-
italicized, bolded, referenced, etc. In these situations, the tagname of
232-
the parent node could be one of the following: 'emphasis', 'strong',
233-
'reference', and 'literal', stored in the 'list_of_markers' set variable. In
234-
this situation, the node's grandparent would have the 'title' tagname instead.
235-
236-
Let's see an example that can cause a problem. The heading provided will be
237-
'Looking at *pandas* docs' versus 'Looking at pandas docs'. In this example,
238-
the stars around pandas in the first string italicizes the word.
239-
However, the doctree would be representing both doctrees as follows:
240-
241-
'Looking at *pandas* docs' 'Looking at pandas docs'
242-
title title
243-
/ | | |
244-
#text emphasis #text VS #text
245-
| | | |
246-
'Looking at' #text 'docs' 'Looking at pandas docs'
247-
|
248-
'pandas'
249-
250-
When iterating through the nodes, we first encounter the node: 'Looking at'.
251-
However, this isn't the full line of the heading (Looking at pandas docs).
252-
We're still missing 'pandas docs'. Hence, we must store this first word into
253-
a variable (my_text in my function) and append this string variable with more
254-
words in case we encounter text that has a parent with tagname in list_of_markers.
255-
In this example, we have to go through two more nodes to get the full heading.
256-
257-
Meanwhile, when nothing has a parent with tagname in list_of_markers, we only
258-
need to access one node to find the 'Looking at the pandas docs' text.
259-
260-
My algorithm adjusts for this pattern, iterating through nodes and
261-
identifying when headings are complete.
262-
263-
Parameters
264-
----------
265-
document : docutils.nodes.document
266-
Root node of a .rst file's document tree
109+
RST file to scan through for headings
267110
268111
Returns
269112
-------
@@ -275,62 +118,42 @@ def find_titles_in_doctree(document: docutils.nodes.document) -> Generator[
275118
276119
'''
277120

278-
# my_text will be used to construct headings and append into title_list
279-
my_text: str = ""
280-
281-
# line_no will be used to retrieve line numbers of certain headings
282-
line_no: int = 0
283-
284-
# A docutils.nodes object that stores a list_of_markers text's grandparent node,
285-
# which should have a tagname of title
286-
marker_grandparent: docutils.nodes.Title = None
287-
288-
# True if the most recent node encountered had a parent with a list_of_markers
289-
# tagname and a grandparent with a tagname of title
290-
before_marker: bool = False
291-
292121
# title_list is the list of headings that are encountered in the RST file
293122
title_list: List[str] = []
294123

295124
# Line numbers at which the corresponding headings in title_list are found
296125
line_number_list: List[int] = []
297126

298-
# Traverse through the nodes.Text in the document tree to construct headings
299-
for node in document.traverse(nodes.Text):
300-
# Case 1: Encounter a node with a parent tagname of title
301-
if (node.parent.tagname == 'title'):
302-
if (before_marker and marker_grandparent == node.parent):
303-
my_text = my_text + node.astext()
304-
before_marker = False
305-
else:
306-
if (my_text != ""):
307-
title_list.append(my_text)
308-
line_number_list.append(line_no)
309-
line_no = find_line_number(node)
310-
my_text = node.astext()
311-
before_marker = False
312-
# Case 2: Encounter a node with parent tagname in list_of_markers
313-
elif (node.parent.parent.tagname == 'title' and
314-
node.parent.tagname in list_of_markers):
315-
line_no = find_line_number(node)
316-
my_text = my_text + node.astext()
317-
before_marker = True
318-
marker_grandparent = node.parent.parent
319-
# Case 3: Encounter parent tagname of none of the above (Ex. 'paragraph')
320-
else:
321-
before_marker = False
322-
if (my_text != ""):
323-
title_list.append(my_text)
324-
line_number_list.append(line_no)
325-
my_text = ""
326-
line_no = 0
327-
328-
# Leftover string that hasn't been appended yet due to how the for loop works
329-
if (my_text != ""):
330-
title_list.append(my_text)
331-
line_number_list.append(line_no)
332-
333-
# Return a list of the headings and a list of their corresponding line numbers
127+
# Open and read the .rst file and store its lines as a list in input
128+
f = open(rst_file, "r")
129+
input = f.read().split('\n')
130+
131+
# Regular expressions matching the adornment (underline/overline) lines that mark a title
132+
regex = {
133+
'*': r'^(?:\*{1})*$', '=': r'^(?:={1})*$', '-': r'^(?:-{1})*$',
134+
'^': r'^(?:\^{1})*$', '~': r'^(?:~{1})*$', '#': r'^(?:#{1})*$',
135+
'"': r'^(?:"{1})*$'
136+
}
137+
138+
# Translation table that strips the '*', '`' and '_' markup characters from a title
139+
table = str.maketrans("", "", '*`_')
140+
141+
# Loop through input lines, appending if they are considered headings
142+
for lineno in range(1, len(input)):
143+
if (len(input[lineno]) != 0 and len(input[lineno - 1]) != 0):
144+
for key in regex:
145+
match = re.search(regex[key], input[lineno])
146+
if (match is not None):
147+
if (lineno >= 2):
148+
if (input[lineno] == input[lineno - 2]):
149+
if (len(input[lineno]) == len(input[lineno - 1])):
150+
title_list.append(input[lineno - 1].translate(table))
151+
line_number_list.append(lineno)
152+
break
153+
if (len(input[lineno]) >= len(input[lineno - 1])):
154+
title_list.append(input[lineno - 1].translate(table))
155+
line_number_list.append(lineno)
156+
334157
return title_list, line_number_list
335158

336159

@@ -345,19 +168,12 @@ def fill_bad_title_dict(rst_file: str) -> None:
345168
346169
'''
347170

348-
# Ensure file isn't one that causes the code to crash
349-
if rst_file in cannot_validate:
350-
return
351-
352171
# Ensure this file doesn't already have a bad_title_dict slot
353172
if rst_file in bad_title_dict:
354173
return
355174

356-
# Parse rst_file with an RST parser
357-
document = parse_RST(rst_file)
358-
359-
# Make a list of headings along with their line numbers from document tree
360-
title_list, line_number_list = find_titles_in_doctree(document)
175+
# Make a list of headings along with their line numbers
176+
title_list, line_number_list = findTitles(rst_file)
361177

362178
# Add to bad_title_dict if the capitalization convention is not followed
363179
for i in range(len(title_list)):
@@ -424,9 +240,9 @@ def main(source_paths: List[str], output_format: str) -> bool:
424240
directory_list = find_rst_files(source_paths)
425241

426242
# Fill the bad_title_dict, which contains all incorrectly capitalized headings
427-
with suppress_stdout_stderr():
428-
for filename in directory_list:
429-
fill_bad_title_dict(filename)
243+
# with suppress_stdout_stderr():
244+
for filename in directory_list:
245+
fill_bad_title_dict(filename)
430246

431247
# Return an exit status of 0 if there are no bad titles in the dictionary
432248
if (len(bad_title_dict) == 0):

0 commit comments

Comments
 (0)