Coverage for sphinxlint/rst.py: 100%

53 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-11-24 18:46 +0100

"""Constants, regexes, and function generating regexes to "parse" reStructuredText.

In this file:

- All constants are ALL_CAPS
- All compiled regexes are suffixed by _RE
"""

7from functools import lru_cache 

8 

9import regex as re 

10 

# Body of a regex character class (no surrounding brackets): it mixes single
# characters with `a-b` ranges and starts with an escaped hyphen so that `-`
# is taken literally.  It is spliced into `[...]` by START_STRING_PREFIX and
# END_STRING_SUFFIX.
# NOTE(review): presumably mirrors the delimiter table of docutils'
# punctuation_chars -- confirm before editing by hand.
DELIMITERS = (
    "\\-/:\u058a\xa1\xb7\xbf\u037e\u0387\u055a-\u055f\u0589"
    "\u05be\u05c0\u05c3\u05c6\u05f3\u05f4\u0609\u060a\u060c"
    "\u060d\u061b\u061e\u061f\u066a-\u066d\u06d4\u0700-\u070d"
    "\u07f7-\u07f9\u0830-\u083e\u0964\u0965\u0970\u0df4\u0e4f"
    "\u0e5a\u0e5b\u0f04-\u0f12\u0f85\u0fd0-\u0fd4\u104a-\u104f"
    "\u10fb\u1361-\u1368\u1400\u166d\u166e\u16eb-\u16ed\u1735"
    "\u1736\u17d4-\u17d6\u17d8-\u17da\u1800-\u180a\u1944\u1945"
    "\u19de\u19df\u1a1e\u1a1f\u1aa0-\u1aa6\u1aa8-\u1aad\u1b5a-"
    "\u1b60\u1c3b-\u1c3f\u1c7e\u1c7f\u1cd3\u2010-\u2017\u2020-"
    "\u2027\u2030-\u2038\u203b-\u203e\u2041-\u2043\u2047-"
    "\u2051\u2053\u2055-\u205e\u2cf9-\u2cfc\u2cfe\u2cff\u2e00"
    "\u2e01\u2e06-\u2e08\u2e0b\u2e0e-\u2e1b\u2e1e\u2e1f\u2e2a-"
    "\u2e2e\u2e30\u2e31\u3001-\u3003\u301c\u3030\u303d\u30a0"
    "\u30fb\ua4fe\ua4ff\ua60d-\ua60f\ua673\ua67e\ua6f2-\ua6f7"
    "\ua874-\ua877\ua8ce\ua8cf\ua8f8-\ua8fa\ua92e\ua92f\ua95f"
    "\ua9c1-\ua9cd\ua9de\ua9df\uaa5c-\uaa5f\uaade\uaadf\uabeb"
    "\ufe10-\ufe16\ufe19\ufe30-\ufe32\ufe45\ufe46\ufe49-\ufe4c"
    "\ufe50-\ufe52\ufe54-\ufe58\ufe5f-\ufe61\ufe63\ufe68\ufe6a"
    "\ufe6b\uff01-\uff03\uff05-\uff07\uff0a\uff0c-\uff0f\uff1a"
    "\uff1b\uff1f\uff20\uff3c\uff61\uff64\uff65"
)

33 

# Closing quote/bracket characters, written so the string can be dropped into
# a regex character class (hence the backslash in front of `]`).
# Order matters: QUOTE_PAIRS_NEGATIVE_LOOKBEHIND zips this string with OPENERS,
# so CLOSERS[i] is the partner of OPENERS[i].  Keep both the same length.
CLOSERS = (
    "\"')>\\]}\u0f3b\u0f3d\u169c\u2046\u207e\u208e\u232a\u2769"
    "\u276b\u276d\u276f\u2771\u2773\u2775\u27c6\u27e7\u27e9\u27eb"
    "\u27ed\u27ef\u2984\u2986\u2988\u298a\u298c\u298e\u2990\u2992"
    "\u2994\u2996\u2998\u29d9\u29db\u29fd\u2e23\u2e25\u2e27\u2e29"
    "\u3009\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b\u301e"
    "\u301f\ufd3f\ufe18\ufe36\ufe38\ufe3a\ufe3c\ufe3e\ufe40\ufe42"
    "\ufe44\ufe48\ufe5a\ufe5c\ufe5e\uff09\uff3d\uff5d\uff60\uff63"
    "\xbb\u2019\u201d\u203a\u2e03\u2e05\u2e0a\u2e0d\u2e1d\u2e21"
    "\u201b\u201f\xab\u2018\u201c\u2039\u2e02\u2e04\u2e09\u2e0c"
    "\u2e1c\u2e20\u201a\u201e"
)

46 

# Opening quote/bracket characters, written so the string can be dropped into
# a regex character class (hence the backslash in front of `[`).
# Order matters: QUOTE_PAIRS_NEGATIVE_LOOKBEHIND zips this string with CLOSERS,
# so OPENERS[i] is the partner of CLOSERS[i].  U+301D appears twice in a row
# so that it lines up with both U+301E and U+301F in CLOSERS; do not
# de-duplicate it.
OPENERS = (
    "\"'(<\\[{\u0f3a\u0f3c\u169b\u2045\u207d\u208d\u2329\u2768"
    "\u276a\u276c\u276e\u2770\u2772\u2774\u27c5\u27e6\u27e8\u27ea"
    "\u27ec\u27ee\u2983\u2985\u2987\u2989\u298b\u298d\u298f\u2991"
    "\u2993\u2995\u2997\u29d8\u29da\u29fc\u2e22\u2e24\u2e26\u2e28"
    "\u3008\u300a\u300c\u300e\u3010\u3014\u3016\u3018\u301a\u301d"
    "\u301d\ufd3e\ufe17\ufe35\ufe37\ufe39\ufe3b\ufe3d\ufe3f\ufe41"
    "\ufe43\ufe47\ufe59\ufe5b\ufe5d\uff08\uff3b\uff5b\uff5f\uff62"
    "\xab\u2018\u201c\u2039\u2e02\u2e04\u2e09\u2e0c\u2e1c\u2e20"
    "\u201a\u201e\xbb\u2019\u201d\u203a\u2e03\u2e05\u2e0a\u2e0d"
    "\u2e1d\u2e21\u201b\u201f"
)

59 

# Directive names, grouped by how their content is listed in this module.
# Every entry is joined with "|" into a regex alternation WITHOUT escaping,
# so an entry may itself be a regex fragment (see 'deprecated(?!-removed)').
# A few names occur more than once ('class', and 'include'/'role' across the
# two lists); that is harmless inside an alternation and is kept as-is.

# fmt: off
DIRECTIVES_CONTAINING_RST = [
    # standard docutils ones
    'admonition', 'attention', 'caution', 'class', 'compound', 'container',
    'danger', 'epigraph', 'error', 'figure', 'footer', 'header', 'highlights',
    'hint', 'image', 'important', 'include', 'line-block', 'list-table', 'meta',
    'note', 'parsed-literal', 'pull-quote', 'replace', 'sidebar', 'tip', 'topic',
    'warning',
    # Sphinx and Python docs custom ones
    'acks', 'attribute', 'autoattribute', 'autoclass', 'autodata',
    'autoexception', 'autofunction', 'automethod', 'automodule',
    'availability', 'centered', 'cfunction', 'class', 'classmethod', 'cmacro',
    'cmdoption', 'cmember', 'confval', 'cssclass', 'ctype',
    'currentmodule', 'cvar', 'data', 'decorator', 'decoratormethod',
    'deprecated-removed', 'deprecated(?!-removed)', 'describe', 'directive',
    'envvar', 'event', 'exception', 'function', 'glossary',
    'highlight', 'highlightlang', 'impl-detail', 'index', 'literalinclude',
    'method', 'miscnews', 'module', 'moduleauthor', 'opcode', 'pdbcommand',
    'program', 'role', 'sectionauthor', 'seealso',
    'sourcecode', 'staticmethod', 'tabularcolumns', 'testcode', 'testoutput',
    'testsetup', 'toctree', 'todo', 'todolist', 'versionadded',
    'versionchanged', 'c:function', 'coroutinefunction'
]

DIRECTIVES_CONTAINING_ARBITRARY_CONTENT = [
    # standard docutils ones
    'contents', 'csv-table', 'date', 'default-role', 'include', 'raw',
    'restructuredtext-test-directive', 'role', 'rubric', 'sectnum', 'table',
    'target-notes', 'title', 'unicode',
    # Sphinx and Python docs custom ones
    'code-block', 'doctest', 'productionlist',
]

# fmt: on

94 

# Match a well-formed directive line (optional indentation, "..", one space,
# a known directive name, then "::") for each of the two directive lists.
DIRECTIVES_CONTAINING_ARBITRARY_CONTENT_RE = re.compile(
    r"^\s*\.\. (" + "|".join(DIRECTIVES_CONTAINING_ARBITRARY_CONTENT) + ")::"
)

DIRECTIVES_CONTAINING_RST_RE = re.compile(
    r"^\s*\.\. (" + "|".join(DIRECTIVES_CONTAINING_RST) + ")::"
)

# Uncompiled alternation group over every known directive name, meant to be
# interpolated into larger patterns.
ALL_DIRECTIVES = (
    "("
    + "|".join(DIRECTIVES_CONTAINING_RST + DIRECTIVES_CONTAINING_ARBITRARY_CONTENT)
    + ")"
)

108 

# Two-character strings: pair[0] is the character found before a backtick and
# pair[1] the one found after it (see QUOTE_PAIRS_NEGATIVE_LOOKBEHIND).
QUOTE_PAIRS = [
    "»»",  # Swedish
    "‘‚",  # Albanian/Greek/Turkish
    "’’",  # Swedish
    "‚‘",  # German
    "‚’",  # Polish
    "“„",  # Albanian/Greek/Turkish
    "„“",  # German
    "„”",  # Polish
    "””",  # Swedish
    "››",  # Swedish
    "''",  # ASCII
    '""',  # ASCII
    "<>",  # ASCII
    "()",  # ASCII
    "[]",  # ASCII
    "{}",  # ASCII
]

127 

128 

# Negative lookbehind (as a pattern string) rejecting a position that is
# immediately preceded by <open>`<close>, i.e. a backtick wrapped in a matching
# quote/bracket pair.  Alternatives come first from QUOTE_PAIRS, then from
# OPENERS/CLOSERS paired position by position.
QUOTE_PAIRS_NEGATIVE_LOOKBEHIND = (
    "(?<!"
    + "|".join(f"{re.escape(pair[0])}`{re.escape(pair[1])}" for pair in QUOTE_PAIRS)
    + "|"
    + "|".join(
        f"{opener}`{closer}"
        # Escaping is applied per character *before* zipping, so the pairing
        # stays one-to-one even for characters that need a backslash.
        for opener, closer in zip(map(re.escape, OPENERS), map(re.escape, CLOSERS))
    )
    + ")"
)

139 

# A role-style name: runs of word characters (underscore excluded through the
# (?!_) lookahead) joined by single `-`, `.`, `_`, `+` or `:` separators.
SIMPLENAME = r"(?:(?!_)\w)+(?:[-._+:](?:(?!_)\w)+)*"

# The following chars groups are from docutils:
CLOSING_DELIMITERS = "\\\\.,;!?"

# Zero-width check: start of string, or preceded by whitespace or one of
# ( / ' { [ * -
BEFORE_ROLE = r"(^|(?<=[\s(/'{\[*-]))"
ROLE_TAG = rf":{SIMPLENAME}:"
ROLE_HEAD = rf"({BEFORE_ROLE}:{SIMPLENAME}:)"  # A role, with a clean start

# Character-class bodies (no surrounding brackets) used by the inline-markup
# patterns.  The \p{..} Unicode property classes need the `regex` module; the
# standard library `re` does not support them.
ASCII_ALLOWED_BEFORE_INLINE_MARKUP = r"""-:/'"<(\[{"""
UNICODE_ALLOWED_BEFORE_INLINE_MARKUP = r"\p{Ps}\p{Pi}\p{Pf}\p{Pd}\p{Po}"
ASCII_ALLOWED_AFTER_INLINE_MARKUP = r"""-.,:;!?/'")\]}>"""
UNICODE_ALLOWED_AFTER_INLINE_MARKUP = r"\p{Pe}\p{Pi}\p{Pf}\p{Pd}\p{Po}"

153 

154 

@lru_cache(maxsize=None)
def inline_markup_gen(start_string, end_string, extra_allowed_before=""):
    """Generate a compiled regex matching one kind of inline markup.

    For example, inline_markup_gen("`", "`") generates a regex matching
    interpreted text.

    Args:
        start_string: Regex fragment for the markup start-string.  It is
            interpolated into the pattern as-is, so characters that are
            special in a regex must already be escaped by the caller.
        end_string: Regex fragment for the markup end-string; interpolated
            as-is, same escaping rule as start_string.
        extra_allowed_before: Optional regex fragment added as one more
            alternative to the "allowed before the start-string" lookbehind.

    Returns:
        A pattern compiled with VERBOSE | DOTALL.  The named group
        ``inline_markup`` spans from the start-string through the end-string.
        Results are memoized, so equal arguments yield the same object.
    """
    if extra_allowed_before:
        # Turn the fragment into an extra "|alternative" of the lookbehind.
        extra_allowed_before = "|" + extra_allowed_before
    return re.compile(
        rf"""
    (?<!\x00) # Both inline markup start-string and end-string must not be preceded by
              # an unescaped backslash

    (?<=                 # Inline markup start-strings must:
        ^|               # start a text block
        \s|              # or be immediately preceded by whitespace,
        [{ASCII_ALLOWED_BEFORE_INLINE_MARKUP}]|  # one of the ASCII characters
        [{UNICODE_ALLOWED_BEFORE_INLINE_MARKUP}] # or a similar non-ASCII
                                                 # punctuation character.
        {extra_allowed_before}
    )

    (?P<inline_markup>
        {start_string}   # Inline markup start
        \S               # Inline markup start-strings must be immediately followed by
                         # non-whitespace.
                         # The inline markup end-string must be separated by at least one
                         # character from the start-string.
        {QUOTE_PAIRS_NEGATIVE_LOOKBEHIND}
        .*?
        (?<=\x00\ |\S)# Inline markup end-strings must be immediately preceded
                         # by non-whitespace.
        {end_string}     # Inline markup end
    )

    (?=       # Inline markup end-strings must
        $|    # end a text block or
        \s|   # be immediately followed by whitespace,
        \x00|
        [{ASCII_ALLOWED_AFTER_INLINE_MARKUP}]|  # one of the ASCII characters
        [{UNICODE_ALLOWED_AFTER_INLINE_MARKUP}] # or a similar non-ASCII
                                                # punctuation character.
    )
    """,
        flags=re.VERBOSE | re.DOTALL,
    )

202 

203 

# https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#inline-markup-recognition-rules
INTERPRETED_TEXT_RE = inline_markup_gen("`", "`")
INLINE_INTERNAL_TARGET_RE = inline_markup_gen("_`", "`")
HYPERLINK_REFERENCES_RE = inline_markup_gen("`", "`_")
ANONYMOUS_HYPERLINK_REFERENCES_RE = inline_markup_gen("`", "`__")
INLINE_LITERAL_RE = inline_markup_gen("``", "``")

# A role tag (:name:) sitting at a valid inline-markup start position and
# followed directly by interpreted text.  The interpreted-text part reuses the
# source of INTERPRETED_TEXT_RE, so its `inline_markup` named group is
# available on matches of this pattern too.
NORMAL_ROLE_RE = re.compile(
    rf"""
    (?<!\x00) # Both inline markup start-string and end-string must not be preceded by
              # an unescaped backslash

    (?<=                 # Inline markup start-strings must:
        ^|               # start a text block
        \s|              # or be immediately preceded by whitespace,
        [{ASCII_ALLOWED_BEFORE_INLINE_MARKUP}]|  # one of the ASCII characters
        [{UNICODE_ALLOWED_BEFORE_INLINE_MARKUP}] # or a similar non-ASCII
                                                 # punctuation character.
    )

    :{SIMPLENAME}:{INTERPRETED_TEXT_RE.pattern}""",
    flags=re.VERBOSE | re.DOTALL,
)

# A stray backtick right before an otherwise well-formed role.  VERBOSE is
# required because the embedded INTERPRETED_TEXT_RE source is written in
# verbose syntax.
BACKTICK_IN_FRONT_OF_ROLE_RE = re.compile(
    rf"(^|\s)`:{SIMPLENAME}:{INTERPRETED_TEXT_RE.pattern}", flags=re.VERBOSE | re.DOTALL
)

230 

# Find comments that look like a directive, like:
# .. versionchanged 3.6
# or
# .. versionchanged: 3.6
# as it should be:
# .. versionchanged:: 3.6
SEEMS_DIRECTIVE_RE = re.compile(rf"^\s*(?<!\.)\.\. {ALL_DIRECTIVES}([^a-z:]|:(?!:))")

# Find directive prefixed with three dots instead of two, like:
# ... versionchanged:: 3.6
# instead of:
# .. versionchanged:: 3.6
THREE_DOT_DIRECTIVE_RE = re.compile(rf"\.\.\. {ALL_DIRECTIVES}::")

244 

# Find role used with double backticks instead of simple backticks like:
# :const:``None``
# instead of:
# :const:`None`
DOUBLE_BACKTICK_ROLE_RE = re.compile(rf"(?<!``){ROLE_HEAD}``")

250 

# Zero-width guards built from the OPENERS / DELIMITERS / CLOSERS character
# tables: a start-string must sit at the beginning of the text or right after
# whitespace, an opener, a delimiter or "|"; an end-string must sit at the end
# of the text or right before whitespace, NUL, a closing delimiter, a
# delimiter, a closer or "|".
# These are plain (non-raw) f-strings on purpose: "\x00" puts a real NUL
# character into the character class.
START_STRING_PREFIX = f"(^|(?<=\\s|[{OPENERS}{DELIMITERS}|]))"
END_STRING_SUFFIX = f"($|(?=\\s|[\x00{CLOSING_DELIMITERS}{DELIMITERS}{CLOSERS}|]))"

253 

# Find role glued with another word like:
# the:c:func:`PyThreadState_LeaveTracing` function.
# instead of:
# the :c:func:`PyThreadState_LeaveTracing` function.
#
# Also finds roles missing their leading colon like:
# issue:`123`
# instead of:
# :issue:`123`

ROLE_GLUED_WITH_WORD_RE = re.compile(rf"(^|\s)(?<!:){SIMPLENAME}:`(?!`)")

# Find a role tag followed directly by text that is not wrapped in backticks.
ROLE_WITH_NO_BACKTICKS_RE = re.compile(rf"(^|\s):{SIMPLENAME}:(?![`:])[^\s`]+(\s|$)")

# Find role missing the colon between the role name and the backtick, like:
# The :issue`123` is ...
ROLE_MISSING_RIGHT_COLON_RE = re.compile(rf"(^|\s):{SIMPLENAME}`(?!`)")

271 

272 

# Backquoted text ending in <http(s)://...>.  Group 1 captures the optional
# whitespace before "<", group 2 the optional trailing underscore, so callers
# can tell which of the two is missing.
SEEMS_HYPERLINK_RE = re.compile(r"`[^`]+?(\s?)<https?://[^`]+>`(_?)")

# Text that still contains rst syntax: a "x:: " marker, any backtick, or
# something shaped like ".. name:".
LEAKED_MARKUP_RE = re.compile(r"[a-z]::\s|`|\.\.\s*\w+:")

276 

# Text wrapped in three backticks on each side, at positions where inline
# markup could start and end.
# The lookbehind on START_STRING_PREFIX is not fixed-width, so this pattern
# relies on the `regex` module rather than the standard library `re`.
TRIPLE_BACKTICKS_RE = re.compile(
    rf"(?:{START_STRING_PREFIX})```[^`]+?(?<!{START_STRING_PREFIX})```(?:{END_STRING_SUFFIX})"
)

# A role whose content is opened with a backtick that is never closed before
# the end of the text.
ROLE_MISSING_CLOSING_BACKTICK_RE = re.compile(rf"({ROLE_HEAD}`[^`]+?)[^`]*$")

282 

283 

# A grid-table border such as "+----+====+": "+", then a run of "+", "=" or
# "-", then a closing "+".
TABLE_HEAD_RE = re.compile(r"^\+[+=-]+\+")


def line_looks_like_a_table(line):
    """Return true if the given line looks like part of an rst table.

    Surrounding whitespace is ignored.  A line qualifies when it starts with
    a border matching TABLE_HEAD_RE, or when it both starts and ends with
    "|" (a table row).
    """
    line = line.strip()
    if TABLE_HEAD_RE.match(line):
        return True
    return line.startswith("|") and line.endswith("|")

293 

294 

def paragraph_looks_like_a_table(paragraph):
    """Return true if the given paragraph looks like an rst table.

    Every line of the paragraph must look like a table line.  A paragraph
    with no lines at all (the empty string) also yields True, because
    all() of an empty sequence is True.
    """
    return all(line_looks_like_a_table(line) for line in paragraph.splitlines())

297 return all(line_looks_like_a_table(line) for line in paragraph.splitlines())