Coverage for sphinxlint/utils.py: 95%

131 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-11-24 18:46 +0100

1"""Just a bunch of utility functions for sphinxlint.""" 

2from functools import lru_cache 

3 

4import regex as re 

5from polib import pofile 

6 

7from sphinxlint import rst 

8 

9 

# Every function wrapped by ``per_file_cache`` is recorded here so that all
# the memoization caches can be reached from one place.
PER_FILE_CACHES = []


def per_file_cache(func):
    """Memoize ``func`` without a size limit and register the cached wrapper.

    The wrapper is appended to ``PER_FILE_CACHES`` and returned, so this can
    be used as a decorator.  Because the wrapper is an ``lru_cache`` object,
    the arguments of the decorated function must be hashable.
    """
    unbounded_cache = lru_cache(maxsize=None)
    cached = unbounded_cache(func)
    PER_FILE_CACHES.append(cached)
    return cached

17 

18 

def match_size(re_match):
    """Return the number of characters spanned by the match object."""
    first, last = re_match.span()
    return last - first

21 

22 

def _clean_heuristic(paragraph, regex):
    """Strip every match of ``regex`` from ``paragraph``, shortest first.

    Removal proceeds from the most "credible" match to the least credible
    one, where the shortest match is considered the most credible.

    For instance, to remove `(.*)` from `(abc def ghi (jkl)`, dropping the
    whole text would swallow a lone `(`; the credible move is to drop
    `(jkl)` only, leaving that lone `(` in place.

    The search is restarted from scratch after each removal and stops once
    ``regex`` no longer matches anywhere; the remaining text is returned.
    """

    def shortest_match(text):
        # Overlapping matches are considered so that a short construct
        # sitting inside a longer candidate can win; on equal sizes the
        # first match found is kept.
        return min(
            regex.finditer(text, overlapped=True), key=match_size, default=None
        )

    found = shortest_match(paragraph)
    while found is not None:
        paragraph = paragraph[: found.start()] + paragraph[found.end() :]
        found = shortest_match(paragraph)
    return paragraph

39 

40 

@per_file_cache
def clean_paragraph(paragraph):
    """Drop well-formed constructs so detectors only see the suspicious ones.

    Escaping backslashes are first turned into null bytes, then everything
    matched by the inline literal, inline internal target, hyperlink
    reference and anonymous hyperlink reference patterns is removed
    (shortest match first), followed by every normal role.  Finally the
    null bytes are turned back into backslashes.
    """
    cleaned = escape2null(paragraph)
    # Order matters: each pattern is applied to what the previous one left.
    heuristic_patterns = (
        rst.INLINE_LITERAL_RE,
        rst.INLINE_INTERNAL_TARGET_RE,
        rst.HYPERLINK_REFERENCES_RE,
        rst.ANONYMOUS_HYPERLINK_REFERENCES_RE,
    )
    for pattern in heuristic_patterns:
        cleaned = _clean_heuristic(cleaned, pattern)
    cleaned = rst.NORMAL_ROLE_RE.sub("", cleaned)
    return cleaned.replace("\x00", "\\")

55 

56 

@per_file_cache
def escape2null(text):
    r"""Return ``text`` with escaping backslashes replaced by null bytes.

    This makes it easy to tell escaping backslashes apart from ordinary
    backslashes when writing regexes.

    For example, \\\\\\` is hard to match and hard to read: it is not
    obvious which backslash escapes which, nor whether the backtick is
    escaped.

    Once the escaping backslashes are replaced by another character the
    structure becomes visible:

        0\0\0\`

    (Zeros are used here for readability; the function really emits
    null bytes, \x00.)

    The backtick is clearly **not** escaped: it follows a plain
    backslash, not an escaping one.
    """
    pieces = []
    cursor = 0
    backslash_at = text.find("\\")
    while backslash_at != -1:
        # Copy the text before the backslash untouched.
        pieces.append(text[cursor:backslash_at])
        # Replace the backslash itself, and keep the escaped character (if
        # any: the slice is empty for a trailing backslash).
        pieces.append("\x00")
        pieces.append(text[backslash_at + 1 : backslash_at + 2])
        # Resume after the escaped character so it is never itself treated
        # as an escaping backslash.
        cursor = backslash_at + 2
        backslash_at = text.find("\\", cursor)
    pieces.append(text[cursor:])
    return "".join(pieces)

89 

90 

@per_file_cache
def paragraphs(lines):
    """Return a tuple of (paragraph_line_no, paragraph_text) pairs.

    A paragraph is a run of consecutive lines that are not exactly "\\n".
    ``paragraph_line_no`` is the 1-based number of its first line and
    ``paragraph_text`` is the concatenation of its lines.
    """
    found = []
    current = []
    first_lno = 1
    for lno, line in enumerate(lines, start=1):
        if line == "\n":
            # A blank line closes the paragraph being collected, if any.
            if current:
                found.append((first_lno, "".join(current)))
                current = []
            continue
        if not current:
            # Remember where this paragraph starts.
            first_lno = lno
        current.append(line)
    if current:
        # The input ended without a blank line after the last paragraph.
        found.append((first_lno, "".join(current)))
    return tuple(found)

111 

112 

def looks_like_glued(match):
    """Tell apart glued tags and tags with a missing colon.

    In one case we can have:

        the:issue:`123`, which is clearly a missing space before the role.

    True is returned in this case.

    In another case we can have:

        c:func:`foo`, which is a missing colon before the tag.

    False is returned in this case.
    """
    matched_text = match.group(0)
    if matched_text.count(":") == 1:
        # With a single colon there is no choice: another colon is missing.
        return False
    known_start_tag = {"c", "py"}
    domain_prefix = " *(" + "|".join(known_start_tag) + "):"
    # Before c:anything:` or py:anything:` we can bet on a missing colon;
    # anything else is probably a glued word.
    return re.match(domain_prefix, matched_text) is None

138 

139 

# A line made only of optional whitespace followed by "..".
_START_OF_COMMENT_BLOCK_RE = re.compile(r"^\s*\.\.$")
# The two dots are escaped so that only a literal ".. productionlist::"
# matches; unescaped, ".." would accept any two characters
# (e.g. "ab productionlist::").
_PRODUCTION_LIST_DIRECTIVE_RE = re.compile(r"^ *\.\. productionlist::")
# Optional leading spaces followed by the explicit markup start ".. ".
_COMMENT_RE = re.compile(r"^ *\.\. ")

143 

144 

def is_multiline_non_rst_block(line):
    """Return True if ``line`` introduces an indented block that is not reST.

    The checks are applied in order: a bare ".." line always opens such a
    block, a directive known to contain reST never does, and otherwise the
    line opens one if it is a directive with arbitrary content, a
    production list, a comment, or ends with "::" (a literal block).
    """
    if _START_OF_COMMENT_BLOCK_RE.search(line):
        return True
    if rst.DIRECTIVES_CONTAINING_RST_RE.match(line):
        # Must be decided before the remaining checks, which could
        # otherwise classify such a directive as a non-reST block.
        return False
    return bool(
        rst.DIRECTIVES_CONTAINING_ARBITRARY_CONTENT_RE.match(line)
        or _PRODUCTION_LIST_DIRECTIVE_RE.search(line)
        or (_COMMENT_RE.search(line) and type_of_explicit_markup(line) == "comment")
        or line.endswith("::\n")  # It's a literal block
    )

160 

161 

_ZERO_OR_MORE_SPACES_RE = re.compile(" *")


def hide_non_rst_blocks(lines, hidden_block_cb=None):
    """Filter out literal blocks, comments, code blocks, ...

    "Removed" lines are replaced by empty lines rather than dropped, so
    line numbers of the result still match the input.

    When ``hidden_block_cb`` is given, it is called for each hidden block
    with the line number of the line that opened the block and the hidden
    text, dedented by the indentation of that opening line.

    The filtered lines are returned as a tuple.
    """

    def indent_of(text):
        # Number of leading spaces (the pattern always matches).
        return len(_ZERO_OR_MORE_SPACES_RE.match(text)[0])

    block_indent = None  # Indentation of the line that opened the block.
    block_start = None  # Line number of the line that opened the block.
    hidden = []
    visible = []
    for lineno, line in enumerate(lines, start=1):
        if block_indent is not None:
            if line == "\n" or indent_of(line) > block_indent:
                # Still inside the block: remember the text, hide the line.
                hidden.append(line if line == "\n" else line[block_indent:])
                line = "\n"
            else:
                # Dedent: the block is over, report what was hidden.
                block_indent = None
                if hidden_block_cb:
                    hidden_block_cb(block_start, "".join(hidden))
                hidden = []
        if block_indent is None and is_multiline_non_rst_block(line):
            # The line that just closed a block may itself open a new one.
            block_indent = indent_of(line)
            block_start = lineno
            assert not hidden
            if type_of_explicit_markup(line) == "comment" and _COMMENT_RE.search(line):
                # The first line of a comment carries comment text: hide it.
                line = "\n"
        visible.append(line)
    if hidden and hidden_block_cb:
        # The input ended while still inside a block.
        hidden_block_cb(block_start, "".join(hidden))
    return tuple(visible)

196 

197 

# Bound ``match`` methods: each one tests whether a string *starts with*
# the given kind of explicit markup.
_starts_with_directive_marker = re.compile(rf"\.\. {rst.ALL_DIRECTIVES}::").match
_starts_with_footnote_marker = re.compile(r"\.\. \[[0-9]+\] ").match
_starts_with_citation_marker = re.compile(r"\.\. \[[^\]]+\] ").match
_starts_with_target = re.compile(r"\.\. _.*[^_]: ").match
_starts_with_substitution_definition = re.compile(r"\.\. \|[^\|]*\| ").match


@per_file_cache
def type_of_explicit_markup(line):
    """Tell apart various explicit markup blocks.

    Leading whitespace is ignored.  The result is one of "directive",
    "footnote", "citation", "target", "substitution_definition", or
    "comment" when nothing else matches.
    """
    stripped = line.lstrip()
    # Order matters: the citation pattern also accepts numeric labels, so
    # footnotes have to be tried first.
    checks = (
        ("directive", _starts_with_directive_marker),
        ("footnote", _starts_with_footnote_marker),
        ("citation", _starts_with_citation_marker),
        ("target", _starts_with_target),
        ("substitution_definition", _starts_with_substitution_definition),
    )
    for kind, starts_with in checks:
        if starts_with(stripped):
            return kind
    return "comment"

220 

221 

def po2rst(text):
    """Extract msgstr entries from a po content, keeping linenos.

    Only translated entries are used.  Blank lines are inserted before
    each entry so that its msgstr starts at the entry's line number
    whenever the output is not already past that line.
    """
    rst_lines = []
    for entry in pofile(text, encoding="UTF-8").translated_entries():
        # Don't check original msgid, assume it's checked directly.
        missing = entry.linenum - 1 - len(rst_lines)
        # A non-positive count yields an empty list: nothing is added.
        rst_lines.extend(["\n"] * missing)
        rst_lines.extend(line + "\n" for line in entry.msgstr.splitlines())
    return "".join(rst_lines)