Coverage for fpdf2_textindex / concordance.py: 84.17%
105 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-24 15:45 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-24 15:45 +0000
1"""Concordance List."""
3from collections.abc import Iterable, Sequence
4import os
5import pathlib
6import re
7from typing import Final, TextIO, overload
9from typing_extensions import Self
11from fpdf2_textindex.constants import LOGGER
12from fpdf2_textindex.md_emphasis import MDEmphasis
13from fpdf2_textindex.utils import insert_at_match
class ConcordanceList(Sequence[tuple[str, str]]):
    """Concordance List.

    An immutable sequence of ``(pattern, replacement)`` rules. The class can
    load a concordance list from a file and parse a text by it, replacing
    matched terms by text index directives of the form
    ``[matched term]{^replacement}``.
    """

    # A bracketed span ``[...]`` whose brackets are not backslash-escaped and
    # whose content contains neither ``]`` nor angle brackets.
    _LEADING_BRACKET_SPAN: Final[str] = (
        r"(?<!\\)\[(?P<leading_bracket_span>[^\]<>]+)(?<!\\)\]"
    )
    # A minimal run of characters that are neither whitespace nor any kind of
    # bracket, followed by the ``md_center`` marker pattern.
    # NOTE(review): ``MDEmphasis.MARKER_PATTERN`` is a format string with a
    # ``name`` placeholder; presumably it matches optional markdown emphasis
    # markers -- confirm against ``md_emphasis``.
    _LEADING_NON_WHITESPACE_SPAN: Final[str] = (
        r"(?P<leading_non_whitespace_span>[^\s\[\]\{\}<>]+?)"
        rf"{MDEmphasis.MARKER_PATTERN.format(name='md_center'):s}"
    )
    # The directive parameters ``{^...}``; must stay on one line and must not
    # contain ``}`` or ``<``.
    _PARAMS: Final[str] = r"\{\^(?P<params>[^\}<\n]*)\}"
    # A complete index directive: optional leading span (either flavour)
    # directly followed by the parameters, which must not be preceded by
    # ``>``, all wrapped in the start/end marker patterns.
    _DIRECTIVE_PATTERN: re.Pattern[str] = re.compile(
        rf"{MDEmphasis.MARKER_PATTERN.format(name='md_start'):s}"
        rf"(?:{_LEADING_NON_WHITESPACE_SPAN:s}|{_LEADING_BRACKET_SPAN:s})?"
        rf"(?<!>){_PARAMS:s}"
        rf"{MDEmphasis.MARKER_PATTERN.format(name='md_end'):s}"
    )
    # Everything that must never be touched by concordance matching: already
    # existing directives and ``<...>`` spans (HTML-like tags).
    _EXCLUDE_PATTERN: re.Pattern[str] = re.compile(
        rf"(?:{_DIRECTIVE_PATTERN.pattern:s})|<.*?>"
    )

    def __init__(self, concordance: Iterable[tuple[str, str]]) -> None:
        """Initializes the list from ``(pattern, replacement)`` pairs.

        Args:
            concordance: The rules; each pattern is used as a regular
                expression by :py:meth:`parse_text`. The iterable is consumed
                once and stored as a tuple.
        """
        self._concordance = tuple(concordance)

    @overload
    def __getitem__(self, index: int, /) -> tuple[str, str]: ...

    @overload
    def __getitem__(self, index: slice, /) -> Sequence[tuple[str, str]]: ...

    def __getitem__(
        self,
        index: slice | int,
    ) -> Sequence[tuple[str, str]] | tuple[str, str]:
        """Returns the rule at ``index`` or a tuple of rules for a slice."""
        return self._concordance[index]

    def __len__(self) -> int:
        """Returns the number of rules."""
        return len(self._concordance)

    def __repr__(self) -> str:
        """Returns a ``ConcordanceList([...])`` representation of the rules."""
        # The class name is formatted plainly (not with ``!r``) so that the
        # result does not contain stray quotes around the name.
        return f"{type(self).__name__:s}({list(self._concordance)!r:s})"

    @classmethod
    def from_file(
        cls,
        filepath: os.PathLike[str] | str,
        separator: str = "\t",
    ) -> Self:
        r"""Creates a :py:class:`ConcordanceList` from a file.

        The file is read as UTF-8 text, independent of the platform's locale.
        See :py:meth:`_parse_file` for the expected line format.

        Args:
            filepath: The filepath.
            separator: The separator. Defaults to `"\t"`.

        Returns:
            The corresponding :py:class:`ConcordanceList`-instance.

        Raises:
            OSError: If the concordance file cannot be opened.
            ValueError: If `separator` is empty.
        """
        filepath = pathlib.Path(filepath)
        LOGGER.info("Reading file %r", filepath.as_posix())
        try:
            # Explicit encoding: the locale default would decode non-ASCII
            # terms differently from machine to machine.
            with filepath.open(mode="r", encoding="utf-8") as f:
                concordance = cls._parse_file(f, separator)
        except OSError as e:
            LOGGER.error(
                "cannot open concordance file %r: %s",
                filepath.as_posix(),
                e,
            )
            raise
        LOGGER.info("Read %d rules from file", len(concordance))
        if not concordance:
            LOGGER.warning(
                "File %r does not comprise rules", filepath.as_posix()
            )
        return cls(concordance)

    @staticmethod
    def _parse_file(
        text_io: TextIO,
        separator: str,
    ) -> list[tuple[str, str]]:
        r"""Parses concordance rules from an open text stream.

        Each non-empty line that does not start with ``#`` yields one rule:
        the first column is the search pattern, the second column the
        replacement (empty if missing); further columns are ignored. Runs of
        consecutive separators count as a single separator.

        Case handling of the pattern:

        * a leading ``=`` is stripped and makes the pattern case-sensitive,
        * a leading ``\=`` yields a literal leading ``=`` (the backslash is
          stripped) and the pattern is matched case-insensitively,
        * a pattern that is not all-lowercase is case-sensitive,
        * any other pattern is prefixed with ``(?i)``.

        Args:
            text_io: The stream to read lines from.
            separator: The column separator; treated as literal text.

        Returns:
            The parsed ``(pattern, replacement)`` rules in file order.

        Raises:
            ValueError: If `separator` is empty.
        """
        if not separator:
            raise ValueError("separator must not be empty")

        # The separator is literal text, not a regular expression: escape it
        # and group it so that e.g. "." or "|" or a multi-character separator
        # collapse correctly. Compiled once, outside of the line loop.
        separator_run = re.compile(rf"(?:{re.escape(separator):s})+")

        data: list[tuple[str, str]] = []
        for i, line in enumerate(text_io):
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            # A callable replacement avoids interpreting backslashes in the
            # separator as regex replacement escapes.
            line = separator_run.sub(lambda _match: separator, line)
            # ``str.split`` with an explicit separator always returns at
            # least one component, so ``comp[0]`` is safe.
            comp = line.split(separator)

            case_sensitive = False
            if comp[0].startswith("\\="):
                # Since we use '=' as a prefix for case-sensitive,
                # allow '\=' for literal equals by stripping '\'
                comp[0] = comp[0][1:]
            elif comp[0].startswith("="):
                # Explicitly case-sensitive
                comp[0] = comp[0][1:]
                case_sensitive = True
            elif comp[0] != comp[0].lower():
                # Implicitly case-sensitive because not all-lowercase
                case_sensitive = True
            if not case_sensitive:
                comp[0] = "(?i)" + comp[0]
            if len(comp) == 1:
                # Pattern without replacement column
                comp.append("")
            data.append((comp[0].strip(), comp[1].strip()))
            LOGGER.debug("\tL%03d %r -> %r", i, *data[-1])
        return data

    def parse_text(self, text: str) -> str:
        """Parses a text and replaces found terms from the concordance list by
        the corresponding directives.

        Existing index directives and ``<...>`` spans are left untouched, and
        a piece of text matched by one rule is not matched again by a later
        rule.

        Args:
            text: The text to parse.

        Returns:
            The parsed text.
        """
        LOGGER.info("Parsing text by concordance list")
        excluded_ranges = self._exclude_ranges(text)
        term_matches = self._match_terms(text, excluded_ranges)

        # Insert suitable index marks. The matches refer to positions in the
        # original text, so the growth caused by earlier insertions is
        # tracked in `offset` and handed to `insert_at_match`.
        offset = 0
        for term, replacement in term_matches:
            mark = f"[{term.group(0):s}]{{^{replacement:s}}}"
            text = insert_at_match(text, term, mark, offset=offset)
            offset += len(mark) - len(term.group(0))

        LOGGER.info(
            "Parsed text by concordance list: %d rules generated %d index "
            "marks",
            len(self._concordance),
            len(term_matches),
        )
        return text

    def _exclude_ranges(self, text: str) -> list[tuple[int, int]]:
        """Finds the ranges of `text` that must not be matched.

        Args:
            text: The text to scan.

        Returns:
            The ``(start, end)`` ranges of all index directives and ``<...>``
            spans, in order of appearance.
        """
        # Parse text for index directive and HTML tag ranges to exclude
        LOGGER.debug("Excluding text index patterns")
        excluded_ranges = []
        for excl in self._EXCLUDE_PATTERN.finditer(text):
            excluded_ranges.append((excl.start(), excl.end()))
            LOGGER.debug(
                "\tExcluded %r at (%d, %d)",
                excl.group(0),
                excl.start(),
                excl.end(),
            )
        return excluded_ranges

    def _match_terms(
        self,
        text: str,
        excluded_ranges: list[tuple[int, int]],
    ) -> list[tuple[re.Match[str], str]]:
        """Matches every concordance pattern against `text`.

        Rules are applied in list order. A match is dropped if it intersects
        an excluded range; accepted matches of a rule become excluded ranges
        for all following rules.

        Args:
            text: The text to search.
            excluded_ranges: The ranges to skip, sorted by start. The list is
                extended and re-sorted in place.

        Returns:
            The accepted ``(match, replacement)`` pairs, sorted by the start
            of the match.
        """
        term_matches = []
        for pattern, replacement in self:
            # Match and replace this term expression wherever it does not
            # intersect excluded ranges
            new_exclusions = []
            # Index of the last excluded range known to end before the
            # current term; matches come in ascending order, so earlier
            # ranges never need to be looked at again for this pattern.
            last_checked = 0
            LOGGER.info("Matching pattern %r on text", pattern)
            for term in re.finditer(pattern, text):
                # Check this is not an excluded range.
                is_excluded = False
                for i in range(last_checked, len(excluded_ranges)):
                    start, end = excluded_ranges[i]
                    if end <= term.start():  # Excluded range ends before term
                        last_checked = i
                        continue
                    # The range ends after the term starts, hence it
                    # intersects the term unless it starts at/after its end.
                    if start < term.end():  # Intersection, abort replacement
                        is_excluded = True
                        LOGGER.debug(
                            "Excluded range %r intersects %r",
                            text[start:end],
                            term.group(0),
                        )
                    # Either way, no later range can affect this term.
                    break

                if not is_excluded:
                    LOGGER.debug(
                        "\tMatched %r at (%d, %d) in text '...%s...'",
                        pattern,
                        term.start(),
                        term.end(),
                        term.group(0),
                    )
                    term_matches.append((term, replacement))
                    new_exclusions.append((term.start(), term.end()))

            # Exclude found terms for this concordance from future matching
            excluded_ranges += new_exclusions
            excluded_ranges.sort(key=lambda e: e[0])

        # Sort all term ranges by order of appearance
        term_matches.sort(key=lambda tm: tm[0].start())
        return term_matches