Coverage for fpdf2_textindex / parser.py: 92.06%
280 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-24 15:45 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-24 15:45 +0000
1"""Text Index Parser."""
3from collections.abc import Iterable, Iterator
4import itertools
5import logging
6import re
7from typing import Final, TYPE_CHECKING
9from fpdf2_textindex import constants as const
10from fpdf2_textindex.alias import AliasRegistry
11from fpdf2_textindex.constants import LOGGER
12from fpdf2_textindex.interface import Alias
13from fpdf2_textindex.interface import CrossReferenceType
14from fpdf2_textindex.interface import LinkLocation
15from fpdf2_textindex.interface import TextIndexEntry
16from fpdf2_textindex.md_emphasis import MDEmphasis
17from fpdf2_textindex.utils import insert_at_match
18from fpdf2_textindex.utils import join_label_path
19from fpdf2_textindex.utils import remove_match_from_str
20from fpdf2_textindex.utils import remove_quotes
21from fpdf2_textindex.utils import split_label_path
class TextIndexParser:
    """Text Index Parser.

    Parses text(s), finds text index directives, creates the corresponding
    entries and replaces the directives by corresponding markdown links.

    Entries are stored in a tree below an internal root entry labelled
    ``"root"``. Iterating the parser walks that tree and skips the first item
    the root's iterator produces (presumably the root itself).
    """

    # Bracketed span right before the parameters, e.g. "[label]{^...}". The
    # brackets must not be preceded by a backslash and the span must not
    # contain "]", "<" or ">".
    _LEADING_BRACKET_SPAN: Final[str] = (
        r"(?<!\\)\[(?P<leading_bracket_span>[^\]<>]+)(?<!\\)\]"
    )
    # Implicit span right before the parameters, e.g. "label{^...}": a
    # non-greedy run without whitespace, brackets, braces or angle brackets,
    # followed by the emphasis marker pattern captured as "md_center".
    _LEADING_NON_WHITESPACE_SPAN: Final[str] = (
        r"(?P<leading_non_whitespace_span>[^\s\[\]\{\}<>]+?)"
        rf"{MDEmphasis.MARKER_PATTERN.format(name='md_center'):s}"
    )
    # The parameter part "{^...}"; it must not contain "}", "<" or a newline.
    _PARAMS: Final[str] = r"\{\^(?P<params>[^\}<\n]*)\}"
    # A complete directive: emphasis marker pattern ("md_start"), an optional
    # leading span, the parameters (not directly preceded by ">") and the
    # emphasis marker pattern again ("md_end").
    _DIRECTIVE_PATTERN: re.Pattern[str] = re.compile(
        rf"{MDEmphasis.MARKER_PATTERN.format(name='md_start'):s}"
        rf"(?:{_LEADING_NON_WHITESPACE_SPAN:s}|{_LEADING_BRACKET_SPAN:s})?"
        rf"(?<!>){_PARAMS:s}"
        rf"{MDEmphasis.MARKER_PATTERN.format(name='md_end'):s}"
    )

    # Everything after a "|" up to the end of the parameters.
    _CROSS_REF_IN_PARAMS_PATTERN: re.Pattern[str] = re.compile(r"\|(.+)$")
    # Leading part of the parameters up to the first "|", "[" or "~" (the
    # strikethrough marker is allowed inside).
    _LABEL_PATH_IN_PARAMS_PATTERN: re.Pattern[str] = re.compile(
        rf"^((?:[^\|\[~]|{MDEmphasis.STRIKETHROUGH.marker:s})+)"
    )
    # Prefix-search wildcard "*^", optionally followed by "-".
    _SEARCH_WILDCARD_PATTERN: re.Pattern[str] = re.compile(r"\*\^(\-?)")
    # "~key" at the end of the parameters; the key may be quoted.
    _SORT_KEY_IN_PARAMS_PATTERN: re.Pattern[str] = re.compile(
        r"\s*\~(['\"]?)(.+)\1$"
    )
    # "[suffix]" inside the parameters; double-quoted parts may be included.
    _SUFFIX_IN_PARAMS_PATTERN: re.Pattern[str] = re.compile(
        r"\s*\[(?P<suffix>(?:[^\]\"]+|\"[^\"]+\")+)(?<!\\)\]\s*"
    )

    def __init__(
        self,
        *,
        strict: bool = True,
    ) -> None:
        """Initializes the parser.

        Args:
            strict: If `True` and an entry will have a normal reference
                (locator) and a SEE-cross reference, a `ValueError` will be
                raised. Else, it will just be a warning and the SEE-cross
                reference will be automatically converted to SEE ALSO.
                Defaults to `True`.
        """
        self._alias_reg = AliasRegistry()
        self._enabled = True
        self._link_locations: dict[str, LinkLocation] = {}
        # Incremented before each processed directive, so -1 means "none yet".
        self._directive_id = -1
        self._root = TextIndexEntry(label="root")
        self._strict = bool(strict)

    def __iter__(self) -> Iterator[TextIndexEntry]:
        # Skip the first item produced by the root's iterator.
        yield from itertools.islice(iter(self._root), 1, None)

    def __len__(self) -> int:
        return sum(1 for _ in self)

    def __repr__(self) -> str:
        return f"{type(self).__name__:s}({len(self):d} entries)"

    @property
    def aliases(self) -> list[Alias]:
        """The parsed aliases."""
        return list(self._alias_reg.values())

    @property
    def entries(self) -> list[TextIndexEntry]:
        """The parsed entries."""
        return list(iter(self))

    @property
    def last_directive_id(self) -> int:
        """Last directive id."""
        return self._directive_id

    @property
    def last_index_id(self) -> str:
        """Last index id."""
        return f"{const.INDEX_ID_PREFIX:s}{self._directive_id:d}"

    def entry_at_label_path(
        self,
        label_path: Iterable[str],
        *,
        create: bool = False,
    ) -> tuple[TextIndexEntry | None, bool]:
        """Returns an entry by its label path.

        If `create=True` and the entry does not exist, it will be created.

        Args:
            label_path: The label path.
            create: Whether to create the entry if it does not exist already.
                Defaults to `False`.

        Returns:
            The found :py:class:`fpdf2_textindex.TextIndexEntry` or `None` and
            whether the entry has existed before.
        """
        created = False
        node = self._root
        for label in label_path:
            child = node.get_child(label)
            if child is None:
                if not create:
                    LOGGER.warning("Failed to find %r", label)
                    return None, False
                LOGGER.debug(
                    "Making new entry %r (%s)",
                    label,
                    f"within {node.label!r:s}" if node.parent else "at root",
                )
                child = TextIndexEntry(label=label, parent=node)
                created = True
            node = child
        return node, not created

    def parse_text(self, text: str) -> str:
        """Parses a text, finds text index directives, creates the corresponding
        entries and replaces the directives by corresponding markdown links.

        Args:
            text: The text to parse.

        Returns:
            The parsed text.

        Raises:
            RuntimeError: If a directive cannot be parsed.
            ValueError: If the label cannot be identified correctly.
                If `strict=True` and adding a SEE-cross reference to an
                entry with a former "normal" reference (locator) or vice
                versa.
        """  # noqa: DOC502
        LOGGER.info("Parsing text by index parser")

        former_len = len(self)
        # The matches below are found in the unmodified input; `offset` is
        # handed to the replacement helpers to account for earlier edits.
        offset = 0

        for directive in self._DIRECTIVE_PATTERN.finditer(text):
            # Parse and encapsulate each entry, either as object or range-end
            directive_str = directive.group(0)
            LOGGER.debug("Directive found: %r", directive_str)
            params = directive.group("params").strip()

            params, toggling, status_toggled = self._parse_toggling_directive(
                params
            )
            if toggling and (self._enabled or status_toggled):
                # This was a toggling mark, and we are either now enabled or we
                # were when we encountered it, remove the mark.
                text = remove_match_from_str(text, directive, offset=offset)
                offset -= len(directive_str)
                continue
            if not toggling and not self._enabled:
                LOGGER.debug(
                    "Disabled, ignoring directive: %r", directive_str
                )
                continue

            # Tentatively claim an id; it is given back on every path that
            # does not end up replacing the directive by a link.
            self._directive_id += 1
            label, content = self._parse_label(directive)

            params, closing, locator_emphasis = self._parse_final_marker(params)
            params, label_path, label, unreferenced_alias = (
                self._parse_label_path(params, label, content, directive_str)
            )
            # Found unreferenced alias
            if unreferenced_alias:
                LOGGER.log(
                    logging.INFO if label else logging.WARNING,
                    "\tUnreferenced alias %s; skipping rest of directive: %r",
                    "created" if label else "definition without a label",
                    directive_str,
                )
                # Replace directive in text
                text = insert_at_match(text, directive, content, offset=offset)
                offset += len(content) - len(directive_str)
                self._directive_id -= 1
                continue

            LOGGER.debug("\tLabel path: %s", label_path)
            LOGGER.debug("\tLabel: %r", label)
            if not label:
                LOGGER.warning(
                    "No entry label specified in directive, ignoring: %r",
                    directive_str,
                )
                self._directive_id -= 1
                continue

            params, suffix = self._parse_suffix(params)
            params, sort_key = self._parse_sort_key(params, content)
            params, create_ref, cross_references = self._parse_cross_ref(
                params, label_path, label, content
            )
            if params.strip():
                msg = f"Unparsed directive content: {params!r:s}"
                LOGGER.error(msg)
                raise RuntimeError(msg)

            # Insert into entries tree
            replace_directive = self._update_index(
                label_path,
                label,
                create_ref,
                cross_references,
                closing,
                directive_str,
                locator_emphasis,
                sort_key,
                suffix,
            )
            if not replace_directive:
                self._directive_id -= 1
                continue

            # Replace directive in text with suitable link
            link = self._create_link(content)
            text = insert_at_match(text, directive, link, offset=offset)
            offset += len(link) - len(directive_str)

        LOGGER.info("Parsed text: %d entries created", len(self) - former_len)
        LOGGER.debug(
            "Created text: %r",
            text if len(text) < 60 else text[:30] + "..." + text[-30:],
        )
        return text

    def _create_link(self, content: str) -> str:
        """Builds the markdown link to the last index id for `content`.

        The emphasis parsed from `content` is applied around the link rather
        than inside the link text.
        """
        unstyled_label, label_emphasis = MDEmphasis.parse(content)
        return label_emphasis.format(
            f"[{unstyled_label:s}](#{self.last_index_id:s})"
        )

    def _parse_cross_ref(
        self,
        params: str,
        label_path: Iterable[str],
        label: str,
        content: str,
    ) -> tuple[str, bool, list[tuple[CrossReferenceType, list[str]]]]:
        """Parses the cross references ("|...") out of the parameters.

        Inbound cross references are added to their source entries right
        away; all others are returned to the caller.

        Args:
            params: The remaining directive parameters.
            label_path: The label path of this directive's entry.
            label: The label of this directive's entry.
            content: The directive content, used for wildcard replacement.

        Returns:
            The remaining parameters, whether a (page) reference should be
            created for the entry and the cross references to add to it.
        """
        create_ref = True
        cross_references: list[tuple[CrossReferenceType, list[str]]] = []
        params = params.strip()
        cross_match = self._CROSS_REF_IN_PARAMS_PATTERN.match(params)
        if cross_match is None:
            return params, create_ref, cross_references

        # `label_path` may be a one-shot iterable but is needed once per
        # inbound reference, so materialize the full path a single time.
        own_path = [*label_path, label]

        refs_string = cross_match.group(1).strip()

        # Process aliases before splitting path
        refs_string = self._alias_reg.replace_aliases(refs_string)

        # Handle wildcards in cross references
        refs_string = self._parse_wildcards(refs_string, content)

        for ref in refs_string.split(const.REFS_DELIMITER):
            ref = ref.strip()

            inbound = ref.startswith(const.INBOUND_MARKER)
            if inbound:
                ref = ref[len(const.INBOUND_MARKER) :]

            ref_type = (
                CrossReferenceType.ALSO
                if ref.startswith(const.ALSO_MARKER)
                else CrossReferenceType.SEE
            )
            if ref_type == CrossReferenceType.ALSO:
                ref = ref[len(const.ALSO_MARKER) :]
            elif not inbound:
                # Do not create a (page-) reference for this mark's entry if
                # there is a SEE-cross reference.
                create_ref = False

            # Split reference label path
            ref_label_path = split_label_path(ref)

            # Cross reference in different entry, referencing this mark's entry
            if inbound:
                source_entry, _ = self.entry_at_label_path(
                    ref_label_path, create=True
                )
                if TYPE_CHECKING:
                    assert isinstance(source_entry, TextIndexEntry)
                LOGGER.debug(
                    "\tCreating inbound %s cross reference from entry %r (%s)",
                    ref_type.upper(),
                    ref_label_path[-1],
                    f"Path: {source_entry.joined_label_path!r:s}"
                    if len(ref_label_path) > 1
                    else "at root",
                )
                source_entry.add_cross_reference(
                    self._directive_id,
                    ref_type,
                    # Fresh list per entry so that entries never share one.
                    list(own_path),
                    strict=self._strict,
                )

            # Cross reference within this mark's entry
            else:
                cross_references.append((ref_type, ref_label_path))

        params = remove_match_from_str(params, cross_match)
        if cross_references:
            LOGGER.debug("\tCross references: %r", cross_references)
        return params, create_ref, cross_references

    def _parse_final_marker(self, params: str) -> tuple[str, bool, bool]:
        """Strips a trailing closing or emphasis marker from the parameters.

        Returns:
            The remaining parameters, whether they ended with the closing
            marker and whether they ended with the emphasis marker. Only one
            marker is removed; the closing marker takes precedence.
        """
        params = params.strip()
        closing = params.endswith(const.CLOSING_MARKER)
        locator_emphasis = params.endswith(const.EMPHASIS_MARKER)
        if closing:
            params = params[: -len(const.CLOSING_MARKER)]
            LOGGER.debug("\tClosing mark: %r", const.CLOSING_MARKER)
        elif locator_emphasis:
            params = params[: -len(const.EMPHASIS_MARKER)]
            LOGGER.debug("\tLocator Emphasis: %r", const.EMPHASIS_MARKER)
        return params, closing, locator_emphasis

    @staticmethod
    def _wrap_in_emphasis(
        label: str,
        start: str | None,
        end: str | None,
    ) -> str:
        """Wraps `label` in `start`/`end` if `end` mirrors `start`.

        Groups that did not take part in the match (`None`) never wrap.
        """
        if start is None or end is None or start != end[::-1]:
            return label
        return start + label + end

    def _parse_label(
        self,
        directive: re.Match[str],
    ) -> tuple[str | None, str]:
        """Extracts the label from the span in front of the parameters.

        Args:
            directive: A match of the directive pattern.

        Returns:
            The label (`None` without a leading span) and the content, which
            is the label or an empty string.
        """
        md_start = directive.group("md_start")
        bracket_span = directive.group("leading_bracket_span")
        word_span = directive.group("leading_non_whitespace_span")

        label = None
        # Leading bracketed span "[x]{^}"
        if bracket_span:
            label = self._wrap_in_emphasis(
                bracket_span, md_start, directive.group("md_end")
            )
        # Leading implicit non-whitespace span "X{^}"
        elif word_span:
            label = word_span
            for end in ("md_center", "md_end"):
                label = self._wrap_in_emphasis(
                    label, md_start, directive.group(end)
                )

        content = label or ""
        LOGGER.debug("\tContent: %r", content)
        return label, content

    def _parse_label_path(
        self,
        params: str,
        label: str | None,
        content: str,
        directive_str: str,
    ) -> tuple[str, list[str], str | None, bool]:
        """Parses label path, label and alias definition from the parameters.

        Args:
            params: The remaining directive parameters.
            label: The label taken from the leading span, if any.
            content: The directive content.
            directive_str: The whole directive, used for messages.

        Returns:
            The remaining parameters, the label path, the label and the
            "unreferenced alias" flag reported by the alias registry.

        Raises:
            ValueError: If no label can be identified.
        """
        label_path: list[str] = []
        label_path_match = self._LABEL_PATH_IN_PARAMS_PATTERN.match(params)
        if not label_path_match:
            return params, label_path, label, False

        label_path_str = label_path_match.group(0).strip()

        # Process aliases before splitting path.
        label_path_str = self._alias_reg.replace_aliases(label_path_str)

        # Handle wildcards in label path
        label_path_str = self._parse_wildcards(label_path_str, label)

        # Having already replaced alias references, check for alias
        # definition at end of label path
        label_path_str, alias_name, alias_start = self._alias_reg.strip_alias(
            label_path_str
        )

        # Split label path
        label_path = split_label_path(label_path_str)

        # Last item is now the label
        if label_path and label_path[-1] not in {"", const.PATH_DELIMITER}:
            label = label_path.pop()
            assert isinstance(label, str)

        # Remove empty last label
        if label_path and label_path[-1] == "":
            label_path.pop()

        # Assert label
        if label is None:
            msg = "cannot identify label: %r"
            LOGGER.error(msg, directive_str)
            raise ValueError(msg % directive_str)

        # Trim label path from params.
        params = remove_match_from_str(params, label_path_match)

        # Check for alias definition.
        label_path, label, unreferenced_alias = (
            self._alias_reg.define_or_replace_from_label_path(
                label_path,
                label,
                content,
                alias_name,
                alias_start,
                directive_str,
            )
        )

        return params, label_path, label, unreferenced_alias

    def _parse_sort_key(
        self,
        params: str,
        content: str,
    ) -> tuple[str, str | None]:
        """Parses the sort key ("~key") out of the parameters.

        Returns:
            The remaining parameters and the sort key, or `None` if the
            parameters contain no sort key.
        """
        params = params.strip()
        match = self._SORT_KEY_IN_PARAMS_PATTERN.search(params)
        if match is None:
            return params, None

        # Handle wildcards in sort key
        sort_key = self._parse_wildcards(
            match.group(2),
            content,
            force_label_only=True,
        )
        params = remove_match_from_str(params, match)
        LOGGER.debug("\tSort key: %r", sort_key)
        return params, sort_key

    def _parse_suffix(self, params: str) -> tuple[str, str | None]:
        """Parses the suffix ("[suffix]") out of the parameters.

        Returns:
            The remaining parameters and the suffix with its quotes removed,
            or `None` if the parameters contain no suffix.
        """
        params = params.strip()
        match = self._SUFFIX_IN_PARAMS_PATTERN.search(params)
        if match is None:
            return params, None

        suffix = remove_quotes(match.group("suffix"))
        params = remove_match_from_str(params, match)
        LOGGER.debug("\tSuffix: %r", suffix)
        return params, suffix

    def _parse_toggling_directive(self, params: str) -> tuple[str, bool, bool]:
        """Handles directives that enable or disable processing.

        Returns:
            The remaining parameters (empty for a toggling directive), whether
            this was a toggling directive and whether it actually changed the
            enabled state.
        """
        params = params.strip()
        toggling = params in {const.DISABLE_MARKER, const.ENABLE_MARKER}
        if not toggling:
            return params, toggling, False

        status_toggled = False
        if params == const.ENABLE_MARKER and not self._enabled:
            self._enabled = True
            status_toggled = True
            LOGGER.info("============ Processing enabled. ============")
        elif params == const.DISABLE_MARKER and self._enabled:
            self._enabled = False
            status_toggled = True
            LOGGER.info("============ Processing disabled. ============")
        return "", toggling, status_toggled

    def _parse_wildcards(
        self,
        directive_str: str,
        label: str | None,
        *,
        force_label_only: bool = False,
    ) -> str:
        """Replaces wildcards in (a part of) a directive.

        "*^" is replaced by the joined label path of the first entry whose
        label starts with `label`, "*^-" (or `force_label_only`) by its quoted
        label only. Without such an entry both fall back to "*". Afterwards
        "**" is replaced by the lowercase unstyled label and "*" by the
        unstyled label.

        Args:
            directive_str: The text to replace the wildcards in.
            label: The label to insert; nothing is replaced if it is empty.
            force_label_only: Treat every "*^" like "*^-".

        Returns:
            The text with the wildcards replaced.
        """
        if not label:
            return directive_str

        found_wildcards = list(
            self._SEARCH_WILDCARD_PATTERN.finditer(directive_str)
        )
        found_item = self._prefix_search(label) if found_wildcards else None
        # Replace from the back so earlier match positions stay valid.
        if isinstance(found_item, TextIndexEntry):
            replace_label = f'"{found_item.label:s}"'
            replace_path = found_item.joined_label_path
            for found_wildcard in reversed(found_wildcards):
                label_only = (found_wildcard.group(1) != "") or force_label_only
                replacement = replace_label if label_only else replace_path
                directive_str = (
                    directive_str[: found_wildcard.start()]
                    + replacement
                    + directive_str[found_wildcard.end() :]
                )
                LOGGER.debug(
                    "\tFound %sprefix match for %r: %r",
                    "(label-only) " if label_only else "",
                    label,
                    replacement,
                )
        else:
            for found_wildcard in reversed(found_wildcards):
                directive_str = (
                    directive_str[: found_wildcard.start()]
                    + "*"  # Fallback on basic wildcard functionality.
                    + directive_str[found_wildcard.end() :]
                )
        unstyled_label = MDEmphasis.parse(label)[0]
        # "**" has to be handled first, else "*" would consume it.
        directive_str = directive_str.replace("**", unstyled_label.lower())
        directive_str = directive_str.replace("*", unstyled_label)
        return directive_str

    def _prefix_search(self, text: str) -> TextIndexEntry | None:
        """Returns the first entry whose label starts with `text`, if any."""
        for entry in self:
            if entry.label.startswith(text):
                return entry
        return None

    def _update_index(
        self,
        label_path: Iterable[str],
        label: str,
        create_ref: bool,
        cross_references: list[tuple[CrossReferenceType, list[str]]],
        closing: bool,
        directive: str,
        locator_emphasis: bool,
        sort_key: str | None,
        suffix: str | None,
    ) -> bool:
        """Applies a parsed directive to the entries tree.

        Args:
            label_path: The label path of the entry.
            label: The label of the entry.
            create_ref: Whether to add a reference to the entry.
            cross_references: The cross references to add to the entry.
            closing: Whether the directive closes the latest reference of an
                existing entry instead of adding to the entry.
            directive: The whole directive, used for messages.
            locator_emphasis: Whether to emphasize the locator.
            sort_key: The sort key to set on the entry, if any.
            suffix: The suffix of the reference, if any.

        Returns:
            `False` if a closing directive addresses a non-existent entry,
            else `True`.
        """
        # `label_path` may be a one-shot iterable but is needed for lookup and
        # for the warning below, so materialize the full path a single time.
        full_path = [*label_path, label]
        entry, existed = self.entry_at_label_path(
            full_path,
            create=not closing,
        )

        if closing:
            if entry is None:
                LOGGER.warning(
                    "Attempted to close a non-existent entry %r; ignoring: %r",
                    join_label_path(full_path),
                    directive,
                )
                return False

            # Entry exists and we are closing its range,
            if entry.references:
                # If it already has a closing ID, update it, but warn
                if entry.references[-1].end_id is not None:
                    LOGGER.warning(
                        "Altering existing end-location of reference %r: %r",
                        entry.joined_label_path,
                        directive,
                    )
                entry.update_latest_reference_end(
                    self._directive_id,
                    end_suffix=suffix,
                )
                LOGGER.debug(
                    "\tSet end-location for reference to %r",
                    entry.joined_label_path,
                )
            else:
                # Entry exists, but has no references, so we can't set a
                # closing id.
                LOGGER.warning(
                    "Attempted to close non-existent reference for "
                    "existing entry %r; ignoring: %r",
                    entry.joined_label_path,
                    directive,
                )
            return True

        # We now have the correct entry, whether it existed before or not
        if TYPE_CHECKING:
            assert isinstance(entry, TextIndexEntry)
        if create_ref:
            entry.add_reference(
                self._directive_id,
                locator_emphasis=locator_emphasis,
                start_suffix=suffix,
                strict=self._strict,
            )
        elif suffix or locator_emphasis:
            LOGGER.warning(
                "Ignoring suffix/locator emphasis in cross reference: %r",
                directive,
            )

        if sort_key:
            if entry.sort_key and entry.sort_key != sort_key:
                LOGGER.warning(
                    "Altering existing sort-key for reference %r: "
                    "before: %r, now: %r, directive: %r",
                    entry.joined_label_path,
                    entry.sort_key,
                    sort_key,
                    directive,
                )
            entry.sort_key = sort_key

        if cross_references:
            if existed:
                LOGGER.debug(
                    "\tAdding cross references to existing entry %r",
                    entry.joined_label_path,
                )
            for ref_type, ref_label_path in cross_references:
                entry.add_cross_reference(
                    self._directive_id,
                    ref_type,
                    ref_label_path,
                    strict=self._strict,
                )

        return True