Coverage for sphinxlint/utils.py: 95%
131 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-11-24 18:46 +0100
« prev ^ index » next coverage.py v7.3.2, created at 2023-11-24 18:46 +0100
1"""Just a bunch of utility functions for sphinxlint."""
2from functools import lru_cache
4import regex as re
5from polib import pofile
7from sphinxlint import rst
# Every function wrapped by ``per_file_cache`` is registered here.
# NOTE(review): presumably the caller clears these caches between files.
PER_FILE_CACHES = []


def per_file_cache(func):
    """Memoize *func* without a size limit and register the cached wrapper.

    The wrapper produced by ``lru_cache(maxsize=None)`` is appended to
    ``PER_FILE_CACHES`` and returned, so it can be used as a decorator.
    """
    cached = lru_cache(maxsize=None)(func)
    PER_FILE_CACHES.append(cached)
    return cached
def match_size(re_match):
    """Return the number of characters covered by the match object."""
    begin, finish = re_match.span()
    return finish - begin
def _clean_heuristic(paragraph, regex):
    """Remove every match of the regex from the paragraph.

    Removal starts with the most "credible" matches, i.e. the shortest
    ones (here be dragons).

    To remove `(.*)` from `(abc def ghi (jkl)`, a bad move consists of
    removing everything (eating a lone `(`), while the most credible
    action to take is to remove `(jkl)`, leaving a lone `(`.

    Zero-width matches are ignored: removing one would leave the
    paragraph unchanged, so picking it could never make progress and
    the loop would not terminate.

    `regex` must be a compiled pattern whose `finditer` accepts the
    `overlapped` keyword.  Returns the paragraph once no non-empty
    match is left.
    """
    while True:
        candidate = min(
            (
                found
                for found in regex.finditer(paragraph, overlapped=True)
                if match_size(found) > 0
            ),
            key=match_size,
            default=None,
        )
        if candidate is None:
            return paragraph
        # Cut the shortest match out, then search again from scratch.
        paragraph = paragraph[: candidate.start()] + paragraph[candidate.end() :]
@per_file_cache
def clean_paragraph(paragraph):
    """Strip all well-formed constructs so detectors can focus on bad ones.

    Well-formed inline literals, inline internal targets, hyperlink
    references, anonymous hyperlink references and roles are removed.
    Escaping backslashes are turned into null bytes while cleaning and
    turned back into backslashes in the returned text.
    """
    cleaned = escape2null(paragraph)
    # Order matters: each pattern is removed from the result of the previous one.
    heuristic_patterns = (
        rst.INLINE_LITERAL_RE,
        rst.INLINE_INTERNAL_TARGET_RE,
        rst.HYPERLINK_REFERENCES_RE,
        rst.ANONYMOUS_HYPERLINK_REFERENCES_RE,
    )
    for pattern in heuristic_patterns:
        cleaned = _clean_heuristic(cleaned, pattern)
    cleaned = rst.NORMAL_ROLE_RE.sub("", cleaned)
    return cleaned.replace("\x00", "\\")
@per_file_cache
def escape2null(text):
    r"""Return a string with escape-backslashes converted to nulls.

    It eases telling apart escaping backslashes and normal backslashes
    in regexes.

    For example: \\\\\\` is hard to match, even with the eyes; it's
    hard to know which backslash escapes which backslash, and it's
    very hard to know if the backtick is escaped.

    By replacing the escaping backslashes with another character they
    become easy to spot:

    0\0\0\`

    (This example uses zeros for readability but the function actually
    uses null bytes, \x00.)

    So we easily see that the backtick is **not** escaped: it's
    preceded by a backslash, not an escaping backslash.
    """
    pieces = []
    cursor = 0
    backslash_at = text.find("\\")
    while backslash_at != -1:
        # Text before the escape is kept untouched.
        pieces.append(text[cursor:backslash_at])
        # The escaping backslash becomes a null byte; the escaped character
        # (if any) is copied as is.
        pieces.append("\x00" + text[backslash_at + 1 : backslash_at + 2])
        cursor = backslash_at + 2  # skip character after escape
        backslash_at = text.find("\\", cursor)
    pieces.append(text[cursor:])
    return "".join(pieces)
@per_file_cache
def paragraphs(lines):
    """Return a tuple of (paragraph_line_no, paragraph_text) pairs
    describing the paragraphs of the given lines.

    Paragraphs are runs of consecutive lines separated by lines equal
    to "\\n"; line numbers are 1-based and point at the first line of
    each paragraph.
    """
    found = []
    current = []
    first_lno = 1
    for lno, line in enumerate(lines, start=1):
        if line == "\n":
            # A blank line closes the paragraph being collected, if any.
            if current:
                found.append((first_lno, "".join(current)))
                current = []
            continue
        if not current:
            # Remember where this paragraph starts.
            first_lno = lno
        current.append(line)
    if current:
        found.append((first_lno, "".join(current)))
    return tuple(found)
def looks_like_glued(match):
    """Tell apart glued tags and tags with a missing colon.

    In one case we can have:

        the:issue:`123`, it's clearly a missing space before the role tag.

    This returns True in that case.

    In another case we can have:

        c:func:`foo`, it's a missing colon before the tag.

    This returns False in that case.
    """
    matched_text = match.group(0)
    if matched_text.count(":") == 1:
        # With a single : there's no choice, another : is missing.
        return False
    known_start_tag = {"c", "py"}
    domain_prefix = " *(" + "|".join(known_start_tag) + "):"
    # Before c:anything:` or py:anything:` we can bet it's a missing colon;
    # in other cases it's probably a glued word.
    return re.match(domain_prefix, matched_text) is None
# Matches a line holding nothing but "..": the start of a comment block.
_START_OF_COMMENT_BLOCK_RE = re.compile(r"^\s*\.\.$")
# Matches the start of a "productionlist" directive.  The dots are escaped so
# only a literal ".." marker is accepted, not any two characters.
_PRODUCTION_LIST_DIRECTIVE_RE = re.compile(r"^ *\.\. productionlist::")
# Matches an explicit markup start: ".. " after optional leading spaces.
_COMMENT_RE = re.compile(r"^ *\.\. ")
def is_multiline_non_rst_block(line):
    """Return True if the next lines are an indented non-reST block.

    A bare ".." line always opens such a block.  A directive known to
    contain reST never does.  Otherwise the block is opened by a
    directive with arbitrary content, a production list, a comment, or
    a line ending with "::" (a literal block).
    """
    if _START_OF_COMMENT_BLOCK_RE.search(line):
        return True
    if rst.DIRECTIVES_CONTAINING_RST_RE.match(line):
        return False
    return bool(
        rst.DIRECTIVES_CONTAINING_ARBITRARY_CONTENT_RE.match(line)
        or _PRODUCTION_LIST_DIRECTIVE_RE.search(line)
        or (_COMMENT_RE.search(line) and type_of_explicit_markup(line) == "comment")
        or line.endswith("::\n")  # It's a literal block
    )
# Used to measure the leading-space indentation of a line.
_ZERO_OR_MORE_SPACES_RE = re.compile(" *")


def hide_non_rst_blocks(lines, hidden_block_cb=None):
    """Filter out literals, comments, code blocks, ...

    "Removed" lines are replaced by empty lines, so line numbering
    still makes sense.

    If given, `hidden_block_cb` is called with the line number of the
    line that opened a hidden block and the hidden text, dedented by
    the indentation of that opening line.

    Returns the filtered lines as a tuple.
    """
    block_indent = None  # indentation of the line that opened the current block
    hidden = []
    block_line_start = None
    visible = []
    for lineno, line in enumerate(lines, start=1):
        if block_indent is not None:
            indent = _ZERO_OR_MORE_SPACES_RE.match(line).end()
            if indent > block_indent or line == "\n":
                hidden.append(line if line == "\n" else line[block_indent:])
                line = "\n"  # Hiding line
            else:
                # Back to the opening indentation: the hidden block is over.
                block_indent = None
                if hidden_block_cb:
                    hidden_block_cb(block_line_start, "".join(hidden))
                hidden = []
        if block_indent is None and is_multiline_non_rst_block(line):
            block_indent = _ZERO_OR_MORE_SPACES_RE.match(line).end()
            block_line_start = lineno
            assert not hidden
            # A comment's opening line is hidden as well.
            if type_of_explicit_markup(line) == "comment" and _COMMENT_RE.search(line):
                line = "\n"
        visible.append(line)
    # Flush a block that runs until the end of the input.
    if hidden and hidden_block_cb:
        hidden_block_cb(block_line_start, "".join(hidden))
    return tuple(visible)
# Bound ``match`` methods: each tests whether a left-stripped line starts with
# a given kind of explicit markup.
_starts_with_directive_marker = re.compile(rf"\.\. {rst.ALL_DIRECTIVES}::").match
_starts_with_footnote_marker = re.compile(r"\.\. \[[0-9]+\] ").match
_starts_with_citation_marker = re.compile(r"\.\. \[[^\]]+\] ").match
_starts_with_target = re.compile(r"\.\. _.*[^_]: ").match
_starts_with_substitution_definition = re.compile(r"\.\. \|[^\|]*\| ").match


@per_file_cache
def type_of_explicit_markup(line):
    """Tell apart various explicit markup blocks.

    Returns "directive", "footnote", "citation", "target" or
    "substitution_definition" for the first matcher that accepts the
    left-stripped line, and "comment" when none does.
    """
    stripped = line.lstrip()
    # Checked in order: the footnote pattern must run before the broader
    # citation pattern.
    checks = (
        (_starts_with_directive_marker, "directive"),
        (_starts_with_footnote_marker, "footnote"),
        (_starts_with_citation_marker, "citation"),
        (_starts_with_target, "target"),
        (_starts_with_substitution_definition, "substitution_definition"),
    )
    for matcher, markup_type in checks:
        if matcher(stripped):
            return markup_type
    return "comment"
def po2rst(text):
    """Extract msgstr entries from a po content, keeping linenos.

    Blank lines are inserted before each translated entry so that its
    msgstr starts at the entry's line number whenever the output is
    still shorter than that.
    """
    rst_lines = []
    catalog = pofile(text, encoding="UTF-8")
    for entry in catalog.translated_entries():
        # Don't check original msgid, assume it's checked directly.
        missing = entry.linenum - 1 - len(rst_lines)
        rst_lines.extend(["\n"] * missing)  # no-op when missing <= 0
        rst_lines.extend(line + "\n" for line in entry.msgstr.splitlines())
    return "".join(rst_lines)