Coverage for fpdf2_textindex / parser.py: 92.06%

280 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-24 15:45 +0000

1"""Text Index Parser.""" 

2 

3from collections.abc import Iterable, Iterator 

4import itertools 

5import logging 

6import re 

7from typing import Final, TYPE_CHECKING 

8 

9from fpdf2_textindex import constants as const 

10from fpdf2_textindex.alias import AliasRegistry 

11from fpdf2_textindex.constants import LOGGER 

12from fpdf2_textindex.interface import Alias 

13from fpdf2_textindex.interface import CrossReferenceType 

14from fpdf2_textindex.interface import LinkLocation 

15from fpdf2_textindex.interface import TextIndexEntry 

16from fpdf2_textindex.md_emphasis import MDEmphasis 

17from fpdf2_textindex.utils import insert_at_match 

18from fpdf2_textindex.utils import join_label_path 

19from fpdf2_textindex.utils import remove_match_from_str 

20from fpdf2_textindex.utils import remove_quotes 

21from fpdf2_textindex.utils import split_label_path 

22 

23 

24class TextIndexParser: 

25 """Text Index Parser. 

26 

27 Parses text(s), finds text index directives, creates the corresponding 

28 entries and replaces the directives by corresponding markdown links. 

29 """ 

30 

# Unescaped "[...]" span; the bracket content (no "]", "<" or ">") is
# captured as "leading_bracket_span".
_LEADING_BRACKET_SPAN: Final[str] = (
    r"(?<!\\)\["
    r"(?P<leading_bracket_span>[^\]<>]+)"
    r"(?<!\\)\]"
)
# Shortest run of characters that are neither whitespace nor brackets,
# braces or angle brackets, followed by the "md_center" emphasis marker.
_LEADING_NON_WHITESPACE_SPAN: Final[str] = (
    r"(?P<leading_non_whitespace_span>[^\s\[\]\{\}<>]+?)"
    rf"{MDEmphasis.MARKER_PATTERN.format(name='md_center'):s}"
)
# "{^...}" block; everything up to the closing brace (no "}", "<" or
# newline) is captured as "params".
_PARAMS: Final[str] = (
    r"\{\^"
    r"(?P<params>[^\}<\n]*)"
    r"\}"
)
# Complete directive: "md_start" marker, optional leading span (implicit or
# bracketed), params block not directly preceded by ">", "md_end" marker.
_DIRECTIVE_PATTERN: re.Pattern[str] = re.compile(
    "".join(
        [
            rf"{MDEmphasis.MARKER_PATTERN.format(name='md_start'):s}",
            rf"(?:{_LEADING_NON_WHITESPACE_SPAN:s}|{_LEADING_BRACKET_SPAN:s})?",
            rf"(?<!>){_PARAMS:s}",
            rf"{MDEmphasis.MARKER_PATTERN.format(name='md_end'):s}",
        ]
    )
)

# "|" followed by everything up to the end of the params.
_CROSS_REF_IN_PARAMS_PATTERN: re.Pattern[str] = re.compile(
    r"\|(.+)$"
)
# Leading run of characters other than "|", "[" and "~"; the strikethrough
# marker is accepted as part of the run.
_LABEL_PATH_IN_PARAMS_PATTERN: re.Pattern[str] = re.compile(
    rf"^((?:[^\|\[~]|{MDEmphasis.STRIKETHROUGH.marker:s})+)"
)
# "*^" wildcard; group 1 holds the optional trailing "-".
_SEARCH_WILDCARD_PATTERN: re.Pattern[str] = re.compile(
    r"\*\^(\-?)"
)
# "~" followed by an optionally quoted value running to the end of the
# params; group 2 holds the value.
_SORT_KEY_IN_PARAMS_PATTERN: re.Pattern[str] = re.compile(
    r"\s*\~(['\"]?)(.+)\1$"
)
# "[...]" block (optionally containing double-quoted parts) captured as
# "suffix", including surrounding whitespace in the overall match.
_SUFFIX_IN_PARAMS_PATTERN: re.Pattern[str] = re.compile(
    r"\s*\[(?P<suffix>(?:[^\]\"]+|\"[^\"]+\")+)(?<!\\)\]\s*"
)

57 

def __init__(
    self,
    *,
    strict: bool = True,
) -> None:
    """Sets up an empty parser.

    Args:
        strict: Forwarded as `strict` whenever a reference or cross
            reference is added to an entry. If `True`, an entry that
            would get both a normal reference (locator) and a SEE-cross
            reference raises a `ValueError`. Otherwise only a warning is
            issued and the SEE-cross reference is automatically
            converted to SEE ALSO. Defaults to `True`.
    """
    # Parsing configuration and state.
    self._strict = bool(strict)
    self._enabled = True
    # No directive has been processed yet.
    self._directive_id = -1

    # Collected data.
    self._alias_reg = AliasRegistry()
    self._link_locations: dict[str, LinkLocation] = {}
    # Artificial root node; it is never exposed when iterating the parser.
    self._root = TextIndexEntry(label="root")

78 

def __iter__(self) -> Iterator[TextIndexEntry]:
    """Yields everything the root entry yields, except its first item."""
    nodes = iter(self._root)
    # Drop the first item (if any) and pass the remainder through.
    next(nodes, None)
    yield from nodes

81 

def __len__(self) -> int:
    """Returns the number of entries yielded when iterating the parser."""
    count = 0
    for _ in self:
        count += 1
    return count

84 

def __repr__(self) -> str:
    """Returns the class name together with the current entry count."""
    class_name = type(self).__name__
    entry_count = len(self)
    return f"{class_name:s}({entry_count:d} entries)"

87 

@property
def aliases(self) -> list[Alias]:
    """The parsed aliases."""
    # Fresh list built from the values held by the alias registry.
    return [*self._alias_reg.values()]

92 

@property
def entries(self) -> list[TextIndexEntry]:
    """The parsed entries."""
    # Collect via plain iteration so that only `__iter__` is involved.
    return [entry for entry in self]

97 

@property
def last_directive_id(self) -> int:
    """Last directive id."""
    directive_id = self._directive_id
    return directive_id

102 

@property
def last_index_id(self) -> str:
    """Last index id."""
    # Index id prefix followed by the id of the last directive.
    prefix = const.INDEX_ID_PREFIX
    number = self._directive_id
    return f"{prefix:s}{number:d}"

107 

def entry_at_label_path(
    self,
    label_path: Iterable[str],
    *,
    create: bool = False,
) -> tuple[TextIndexEntry | None, bool]:
    """Looks up an entry by walking a label path down from the root.

    With `create=True`, every entry missing along the path is created.

    Args:
        label_path: The label path.
        create: Whether to create the entry if it does not exist already.
            Defaults to `False`.

    Returns:
        The found :py:class:`fpdf2_textindex.TextIndexEntry` or `None` and
        whether the entry has existed before.
    """
    current = self._root
    all_existed = True
    for label in label_path:
        found = current.get_child(label)
        if found is not None:
            current = found
            continue

        # Missing entry: give up unless creation was requested.
        if not create:
            LOGGER.warning("Failed to find %r", label)
            return None, False

        location = (
            f"within {current.label!r:s}" if current.parent else "at root"
        )
        LOGGER.debug("Making new entry %r (%s)", label, location)
        current = TextIndexEntry(label=label, parent=current)
        all_existed = False
    return current, all_existed

144 

def parse_text(self, text: str) -> str:
    """Parses a text, finds text index directives, creates the corresponding
    entries and replaces the directives by corresponding markdown links.

    Toggling directives (enable/disable markers) never create entries. They
    are removed from the text, except for a disable marker met while
    parsing is already disabled, which is left in place like every other
    directive found while disabled.

    Args:
        text: The text to parse.

    Returns:
        The parsed text.

    Raises:
        RuntimeError: If a directive cannot be parsed.
        ValueError: If the label cannot be identified correctly.
            If `strict=True` and adding a SEE-cross reference to an
            entry with a former "normal" reference (locator) or vice versa.
    """  # noqa: DOC502
    LOGGER.info("Parsing text by index parser")

    former_len = len(self)
    # Matches are found in the original text; the offset accounts for the
    # length changes caused by earlier replacements.
    offset = 0

    for directive in self._DIRECTIVE_PATTERN.finditer(text):
        # Parse and encapsulate each entry, either as object or range-end
        directive_str = directive.group(0)
        LOGGER.debug("Directive found: %r", directive_str)
        params = directive.group("params").strip()

        params, toggling, status_toggled = self._parse_toggling_directive(
            params
        )
        if toggling:
            if self._enabled or status_toggled:
                # This was a toggling mark, and we are either now enabled
                # or we were when we encountered it, remove the mark.
                text = remove_match_from_str(text, directive, offset=offset)
                offset -= len(directive_str)
            else:
                # Redundant disable mark while already disabled: treat it
                # like any other directive found while disabled instead
                # of processing it as a regular directive.
                LOGGER.debug(
                    "Disabled, ignoring directive: %r", directive_str
                )
            continue
        if not self._enabled:
            LOGGER.debug("Disabled, ignoring directive: %r", directive_str)
            continue

        self._directive_id += 1
        label, content = self._parse_label(directive)

        params, closing, locator_emphasis = self._parse_final_marker(params)
        params, label_path, label, unreferenced_alias = (
            self._parse_label_path(params, label, content, directive_str)
        )
        # Found unreferenced alias
        if unreferenced_alias:
            LOGGER.log(
                logging.INFO if label else logging.WARNING,
                "\tUnreferenced alias %s; skipping rest of directive: %r",
                "created" if label else "definition without a label",
                directive_str,
            )
            # Replace directive in text
            text = insert_at_match(text, directive, content, offset=offset)
            offset += len(content) - len(directive_str)
            # No link was created, so the id is released again.
            self._directive_id -= 1
            continue

        LOGGER.debug("\tLabel path: %s", label_path)
        LOGGER.debug("\tLabel: %r", label)
        if not label:
            LOGGER.warning(
                "No entry label specified in directive, ignoring: %r",
                directive_str,
            )
            self._directive_id -= 1
            continue

        params, suffix = self._parse_suffix(params)
        params, sort_key = self._parse_sort_key(params, content)
        params, create_ref, cross_references = self._parse_cross_ref(
            params, label_path, label, content
        )
        # Every parser above strips what it consumed; leftovers are
        # content no parser understood.
        if params.strip():
            msg = f"Unparsed directive content: {params!r:s}"
            LOGGER.error(msg)
            raise RuntimeError(msg)

        # Insert into entries tree
        replace_directive = self._update_index(
            label_path,
            label,
            create_ref,
            cross_references,
            closing,
            directive_str,
            locator_emphasis,
            sort_key,
            suffix,
        )
        if not replace_directive:
            self._directive_id -= 1
            continue

        # Replace directive in text with suitable link
        link = self._create_link(content)
        text = insert_at_match(text, directive, link, offset=offset)
        offset += len(link) - len(directive_str)

    LOGGER.info("Parsed text: %d entries created", len(self) - former_len)
    LOGGER.debug(
        "Created text: %r",
        text if len(text) < 60 else text[:30] + "..." + text[-30:],
    )
    return text

256 

def _create_link(self, content: str) -> str:
    """Builds the markdown link that replaces a directive in the text.

    Args:
        content: The text of the directive's leading span, possibly
            carrying markdown emphasis.

    Returns:
        A markdown link to the last index id with the unstyled content as
        link text, formatted with the emphasis parsed from `content`.
    """
    plain_label, emphasis = MDEmphasis.parse(content)
    link = f"[{plain_label:s}](#{self.last_index_id:s})"
    return emphasis.format(link)

262 

def _parse_cross_ref(
    self,
    params: str,
    label_path: Iterable[str],
    label: str,
    content: str,
) -> tuple[str, bool, list[tuple[CrossReferenceType, list[str]]]]:
    """Parses the cross reference part of the directive params.

    Inbound cross references are added directly to their source entry
    (which is created if necessary). All other cross references are
    collected and returned.

    Args:
        params: The not yet parsed rest of the directive params.
        label_path: The label path of the directive's entry.
        label: The label of the directive's entry.
        content: The content of the directive's leading span, used to
            resolve wildcards.

    Returns:
        The params without the cross reference part, whether a normal
        reference should be created for the directive's entry and the
        collected (type, label path) cross references.
    """
    params = params.strip()
    make_reference = True
    collected: list[tuple[CrossReferenceType, list[str]]] = []

    found = self._CROSS_REF_IN_PARAMS_PATTERN.match(params)
    if found is None:
        return params, make_reference, collected

    # Aliases and wildcards are resolved before the references are split.
    raw_refs = found.group(1).strip()
    raw_refs = self._alias_reg.replace_aliases(raw_refs)
    raw_refs = self._parse_wildcards(raw_refs, content)

    for raw_ref in raw_refs.split(const.REFS_DELIMITER):
        ref = raw_ref.strip()

        inbound = ref.startswith(const.INBOUND_MARKER)
        if inbound:
            ref = ref[len(const.INBOUND_MARKER) :]

        if ref.startswith(const.ALSO_MARKER):
            ref_type = CrossReferenceType.ALSO
            ref = ref[len(const.ALSO_MARKER) :]
        else:
            ref_type = CrossReferenceType.SEE
            if not inbound:
                # Do not create a (page-) reference for this mark's entry
                # if there is a SEE-cross reference.
                make_reference = False

        ref_label_path = split_label_path(ref)

        if not inbound:
            # Cross reference within this mark's entry
            collected.append((ref_type, ref_label_path))
            continue

        # Cross reference in different entry, referencing this mark's entry
        source_entry, _ = self.entry_at_label_path(
            ref_label_path, create=True
        )
        if TYPE_CHECKING:
            assert isinstance(source_entry, TextIndexEntry)
        LOGGER.debug(
            "\tCreating inbound %s cross reference from entry %r (%s)",
            ref_type.upper(),
            ref_label_path[-1],
            f"Path: {source_entry.joined_label_path!r:s}"
            if len(ref_label_path) > 1
            else "at root",
        )
        source_entry.add_cross_reference(
            self._directive_id,
            ref_type,
            [*label_path, label],
            strict=self._strict,
        )

    params = remove_match_from_str(params, found)
    if collected:
        LOGGER.debug("\tCross references: %r", collected)
    return params, make_reference, collected

338 

def _parse_final_marker(self, params: str) -> tuple[str, bool, bool]:
    """Detects and strips a trailing closing or locator emphasis marker.

    Args:
        params: The not yet parsed rest of the directive params.

    Returns:
        The stripped params without the detected marker, whether the
        params end with the closing marker and whether they end with the
        emphasis marker. If both apply, only the closing marker is cut.
    """
    stripped = params.strip()
    closing = stripped.endswith(const.CLOSING_MARKER)
    locator_emphasis = stripped.endswith(const.EMPHASIS_MARKER)

    # The closing marker takes precedence over the emphasis marker.
    if closing:
        marker, log_format = const.CLOSING_MARKER, "\tClosing mark: %r"
    elif locator_emphasis:
        marker, log_format = const.EMPHASIS_MARKER, "\tLocator Emphasis: %r"
    else:
        return stripped, closing, locator_emphasis

    stripped = stripped[: -len(marker)]
    LOGGER.debug(log_format, marker)
    return stripped, closing, locator_emphasis

350 

def _parse_label(
    self,
    directive: re.Match[str],
) -> tuple[str | None, str]:
    """Extracts the label from the leading span of a directive match.

    The label is taken from the bracketed span ("[x]{^}") or, failing
    that, from the implicit non-whitespace span ("X{^}"). It is wrapped in
    the surrounding emphasis markers whenever the start marker equals the
    reversed closing marker.

    Args:
        directive: A match of the directive pattern.

    Returns:
        The label (`None` without a leading span) and the content, which
        is the label or an empty string.
    """
    md_start = directive.group("md_start")
    bracket_span = directive.group("leading_bracket_span")
    plain_span = directive.group("leading_non_whitespace_span")

    label: str | None
    closing_groups: tuple[str, ...]
    if bracket_span:
        # Leading bracketed span "[x]{^}"
        label = bracket_span
        closing_groups = ("md_end",)
    elif plain_span:
        # Leading implicit non-whitespace span "X{^}"
        label = plain_span
        closing_groups = ("md_center", "md_end")
    else:
        label = None
        closing_groups = ()

    if md_start is not None:
        for group_name in closing_groups:
            md_close = directive.group(group_name)
            if md_start == md_close[::-1]:
                label = md_start + label + md_close

    content = label if label else ""
    LOGGER.debug("\tContent: %r", content)
    return label, content

387 

def _parse_label_path(
    self,
    params: str,
    label: str | None,
    content: str,
    directive_str: str,
) -> tuple[str, list[str], str | None, bool]:
    """Parses the label path at the beginning of the directive params.

    Args:
        params: The not yet parsed rest of the directive params.
        label: The label taken from the directive's leading span, if any.
        content: The content of the directive's leading span.
        directive_str: The complete directive, used for messages.

    Returns:
        The params without the label path, the label path, the label and
        the unreferenced-alias flag reported by the alias registry
        (`False` if the params contain no label path).

    Raises:
        ValueError: If no label can be identified.
    """
    path_match = self._LABEL_PATH_IN_PARAMS_PATTERN.match(params)
    if not path_match:
        return params, [], label, False

    # Resolve alias references and wildcards first, then split off an
    # alias definition at the end of the label path.
    raw_path = path_match.group(0).strip()
    raw_path = self._alias_reg.replace_aliases(raw_path)
    raw_path = self._parse_wildcards(raw_path, label)
    raw_path, alias_name, alias_start = self._alias_reg.strip_alias(raw_path)

    segments = split_label_path(raw_path)

    # A non-empty last segment is the label itself.
    if segments[-1] not in {"", const.PATH_DELIMITER}:
        label = segments.pop()
        assert isinstance(label, str)

    # Drop an empty trailing segment.
    if segments and segments[-1] == "":
        segments.pop()

    if label is None:
        msg = "cannot identify label: %r"
        LOGGER.error(msg, directive_str)
        raise ValueError(msg % directive_str)

    # The label path is consumed; cut it from the params.
    params = remove_match_from_str(params, path_match)

    # Let the alias registry define or replace an alias.
    segments, label, unreferenced_alias = (
        self._alias_reg.define_or_replace_from_label_path(
            segments,
            label,
            content,
            alias_name,
            alias_start,
            directive_str,
        )
    )
    return params, segments, label, unreferenced_alias

448 

def _parse_sort_key(
    self,
    params: str,
    content: str,
) -> tuple[str, str | None]:
    """Parses the sort key part of the directive params.

    Args:
        params: The not yet parsed rest of the directive params.
        content: The content of the directive's leading span, used to
            resolve wildcards in the sort key.

    Returns:
        The stripped params without the sort key part and the sort key,
        or `None` if there is none.
    """
    stripped = params.strip()
    found = self._SORT_KEY_IN_PARAMS_PATTERN.search(stripped)
    if found is None:
        return stripped, None

    # Wildcards in a sort key are always resolved label-only.
    sort_key = self._parse_wildcards(
        found.group(2),
        content,
        force_label_only=True,
    )
    remaining = remove_match_from_str(stripped, found)
    LOGGER.debug("\tSort key: %r", sort_key)
    return remaining, sort_key

468 

def _parse_suffix(self, params: str) -> tuple[str, str | None]:
    """Parses the bracketed suffix part of the directive params.

    Args:
        params: The not yet parsed rest of the directive params.

    Returns:
        The stripped params without the suffix part and the suffix after
        `remove_quotes`, or `None` if there is none.
    """
    stripped = params.strip()
    found = self._SUFFIX_IN_PARAMS_PATTERN.search(stripped)
    if found is None:
        return stripped, None

    suffix = remove_quotes(found.group("suffix"))
    remaining = remove_match_from_str(stripped, found)
    LOGGER.debug("\tSuffix: %r", suffix)
    return remaining, suffix

479 

def _parse_toggling_directive(self, params: str) -> tuple[str, bool, bool]:
    """Handles directives that enable or disable the processing.

    Args:
        params: The directive params.

    Returns:
        The remaining params (empty for a toggling directive, else the
        stripped params), whether the directive is a toggling directive
        and whether it actually changed the enabled state.
    """
    marker = params.strip()
    if marker not in {const.DISABLE_MARKER, const.ENABLE_MARKER}:
        return marker, False, False

    if marker == const.ENABLE_MARKER and not self._enabled:
        new_state = True
        banner = "============ Processing enabled. ============"
    elif marker == const.DISABLE_MARKER and self._enabled:
        new_state = False
        banner = "============ Processing disabled. ============"
    else:
        # The requested state is already active; nothing changes.
        return "", True, False

    self._enabled = new_state
    LOGGER.info(banner)
    return "", True, True

496 

def _parse_wildcards(
    self,
    directive_str: str,
    label: str | None,
    *,
    force_label_only: bool = False,
) -> str:
    """Replaces the wildcards in a part of a directive.

    Search wildcards ("*^", "*^-") are replaced by the entry found by a
    prefix search for `label`, or by a plain "*" if none is found.
    Afterwards "**" is replaced by the lower-cased unstyled label and "*"
    by the unstyled label.

    Args:
        directive_str: The part of the directive to process.
        label: The label to insert. If empty or `None`, nothing is
            replaced.
        force_label_only: Whether search wildcards are always replaced by
            the quoted label of the found entry instead of its joined
            label path. Defaults to `False`.

    Returns:
        The processed string.
    """
    if not label:
        return directive_str

    def splice(source: str, start: int, end: int, filler: str) -> str:
        # Swap source[start:end] for the filler.
        return "".join((source[:start], filler, source[end:]))

    wildcards = list(self._SEARCH_WILDCARD_PATTERN.finditer(directive_str))
    # The prefix search only runs if there is a search wildcard at all.
    matched_entry = self._prefix_search(label) if wildcards else None

    # Replace back to front so that earlier match positions stay valid.
    if isinstance(matched_entry, TextIndexEntry):
        quoted_label = f'"{matched_entry.label:s}"'
        joined_path = matched_entry.joined_label_path
        for wildcard in wildcards[::-1]:
            label_only = (wildcard.group(1) != "") or force_label_only
            replacement = quoted_label if label_only else joined_path
            directive_str = splice(
                directive_str, wildcard.start(), wildcard.end(), replacement
            )
        LOGGER.debug(
            "\tFound %sprefix match for %r: %r",
            "(label-only) " if label_only else "",
            label,
            replacement,
        )
    else:
        for wildcard in wildcards[::-1]:
            # Fallback on basic wildcard functionality.
            directive_str = splice(
                directive_str, wildcard.start(), wildcard.end(), "*"
            )

    plain_label = MDEmphasis.parse(label)[0]
    directive_str = directive_str.replace("**", plain_label.lower())
    return directive_str.replace("*", plain_label)

541 

def _prefix_search(self, text: str) -> TextIndexEntry | None:
    """Returns the first entry whose label starts with the given text.

    Args:
        text: The prefix to look for.

    Returns:
        The first matching entry in iteration order or `None`.
    """
    candidates = (
        entry for entry in self if entry.label.startswith(text)
    )
    return next(candidates, None)

547 

def _update_index(
    self,
    label_path: Iterable[str],
    label: str,
    create_ref: bool,
    cross_references: list[tuple[CrossReferenceType, list[str]]],
    closing: bool,
    directive: str,
    locator_emphasis: bool,
    sort_key: str | None,
    suffix: str | None,
) -> bool:
    """Applies a parsed directive to the entries tree.

    A closing directive sets the end of the latest reference of an
    existing entry. Any other directive fetches or creates the entry and
    adds the reference, sort key and cross references to it.

    Args:
        label_path: The label path of the entry (without its own label).
        label: The label of the entry.
        create_ref: Whether to add a normal reference to the entry.
        cross_references: The (type, label path) cross references to add.
        closing: Whether the directive closes a reference range.
        directive: The complete directive, used for messages.
        locator_emphasis: Whether the reference's locator is emphasized.
        sort_key: The sort key to set, if any.
        suffix: The suffix of the reference (start or end), if any.

    Returns:
        `False` if a closing directive refers to a non-existent entry,
        else `True`.
    """
    # Build the full path once; `label_path` may be a one-shot iterable.
    full_path = [*label_path, label]
    entry, existed = self.entry_at_label_path(full_path, create=not closing)

    if closing:
        # Closing directives never create entries, so the lookup may fail.
        # Test for `None` explicitly rather than by truthiness so that an
        # existing entry is never mistaken for a missing one.
        if entry is None:
            LOGGER.warning(
                "Attempted to close a non-existent entry %r; ignoring: %r",
                join_label_path(full_path),
                directive,
            )
            return False

        if not entry.references:
            # Entry exists, but has no references, so we can't set a
            # closing id.
            LOGGER.warning(
                "Attempted to close non-existent reference for "
                "existing entry %r; ignoring: %r",
                entry.joined_label_path,
                directive,
            )
            return True

        # If it already has a closing ID, update it, but warn
        if entry.references[-1].end_id is not None:
            LOGGER.warning(
                "Altering existing end-location of reference %r: %r",
                entry.joined_label_path,
                directive,
            )
        entry.update_latest_reference_end(
            self._directive_id,
            end_suffix=suffix,
        )
        LOGGER.debug(
            "\tSet end-location for reference to %r",
            entry.joined_label_path,
        )
        return True

    # We now have the correct entry, whether it existed before or not
    if TYPE_CHECKING:
        assert isinstance(entry, TextIndexEntry)
    if create_ref:
        entry.add_reference(
            self._directive_id,
            locator_emphasis=locator_emphasis,
            start_suffix=suffix,
            strict=self._strict,
        )
    elif suffix or locator_emphasis:
        # Without a normal reference there is no locator to decorate.
        LOGGER.warning(
            "Ignoring suffix/locator emphasis in cross reference: %r",
            directive,
        )

    if sort_key:
        if entry.sort_key and entry.sort_key != sort_key:
            LOGGER.warning(
                "Altering existing sort-key for reference %r: "
                "before: %r, now: %r, directive: %r",
                entry.joined_label_path,
                entry.sort_key,
                sort_key,
                directive,
            )
        entry.sort_key = sort_key

    if cross_references and existed:
        LOGGER.debug(
            "\tAdding cross references to existing entry %r",
            entry.joined_label_path,
        )
    for ref_type, ref_label_path in cross_references:
        entry.add_cross_reference(
            self._directive_id,
            ref_type,
            ref_label_path,
            strict=self._strict,
        )

    return True