Coverage for fpdf2_textindex / concordance.py: 84.17%

105 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-24 15:45 +0000

1"""Concordance List.""" 

2 

3from collections.abc import Iterable, Sequence 

4import os 

5import pathlib 

6import re 

7from typing import Final, TextIO, overload 

8 

9from typing_extensions import Self 

10 

11from fpdf2_textindex.constants import LOGGER 

12from fpdf2_textindex.md_emphasis import MDEmphasis 

13from fpdf2_textindex.utils import insert_at_match 

14 

15 

class ConcordanceList(Sequence[tuple[str, str]]):
    """Concordance List.

    The class can load a concordance list from a file and parse a text by it,
    replacing matched terms by text index directives.
    """

    # A bracketed literal span ``[...]`` whose brackets are not escaped with a
    # backslash and that contains no closing bracket or angle bracket.
    _LEADING_BRACKET_SPAN: Final[str] = (
        r"(?<!\\)\[(?P<leading_bracket_span>[^\]<>]+)(?<!\\)\]"
    )
    # A minimal run of characters that are neither whitespace nor any kind of
    # bracket, followed by an emphasis marker (named group ``md_center``).
    _LEADING_NON_WHITESPACE_SPAN: Final[str] = (
        r"(?P<leading_non_whitespace_span>[^\s\[\]\{\}<>]+?)"
        rf"{MDEmphasis.MARKER_PATTERN.format(name='md_center'):s}"
    )
    # The ``{^...}`` parameter group of a text index directive.
    _PARAMS: Final[str] = r"\{\^(?P<params>[^\}<\n]*)\}"
    # A complete index directive: emphasis marker, an optional leading span
    # (plain or bracketed), the parameter group, and a closing marker.
    _DIRECTIVE_PATTERN: re.Pattern[str] = re.compile(
        rf"{MDEmphasis.MARKER_PATTERN.format(name='md_start'):s}"
        rf"(?:{_LEADING_NON_WHITESPACE_SPAN:s}|{_LEADING_BRACKET_SPAN:s})?"
        rf"(?<!>){_PARAMS:s}"
        rf"{MDEmphasis.MARKER_PATTERN.format(name='md_end'):s}"
    )
    # Text ranges that must never be rewritten: existing index directives and
    # anything that looks like an HTML/XML tag.
    _EXCLUDE_PATTERN: re.Pattern[str] = re.compile(
        rf"(?:{_DIRECTIVE_PATTERN.pattern:s})|<.*?>"
    )

    def __init__(self, concordance: Iterable[tuple[str, str]]) -> None:
        """Initializes the concordance list.

        Args:
            concordance: An iterable of ``(pattern, replacement)`` pairs.
        """
        self._concordance: tuple[tuple[str, str], ...] = tuple(concordance)

    @overload
    def __getitem__(self, index: int, /) -> tuple[str, str]: ...

    @overload
    def __getitem__(self, index: slice, /) -> Sequence[tuple[str, str]]: ...

    def __getitem__(
        self,
        index: slice | int,
    ) -> Sequence[tuple[str, str]] | tuple[str, str]:
        """Returns the rule(s) at *index*."""
        return self._concordance[index]

    def __len__(self) -> int:
        """Returns the number of rules."""
        return len(self._concordance)

    def __repr__(self) -> str:
        # FIX: the previous implementation applied ``!r`` to the class name,
        # which wrapped it in quotes (e.g. ``'ConcordanceList'()``).
        return f"{type(self).__name__:s}()"

    @classmethod
    def from_file(
        cls,
        filepath: os.PathLike[str] | str,
        separator: str = "\t",
    ) -> Self:
        r"""Creates a :py:class:`ConcordanceList` from a file.

        Args:
            filepath: The filepath.
            separator: The separator. Defaults to `"\t"`.

        Returns:
            The corresponding :py:class:`ConcordanceList`-instance.

        Raises:
            OSError: If the concordance file cannot be opened.
        """
        filepath = pathlib.Path(filepath)
        LOGGER.info("Reading file %r", filepath.as_posix())
        try:
            # NOTE(review): opens with the platform default encoding; an
            # explicit ``encoding="utf-8"`` may be intended — confirm with the
            # expected concordance file format before changing.
            with filepath.open(mode="r") as f:
                concordance = cls._parse_file(f, separator)
        except OSError as e:
            LOGGER.error(
                "cannot open concordance file %r: %s",
                filepath.as_posix(),
                e,
            )
            raise
        LOGGER.info("Read %d rules from file", len(concordance))
        if not concordance:
            LOGGER.warning(
                "File %r does not comprise rules", filepath.as_posix()
            )
        return cls(concordance)

    @staticmethod
    def _parse_file(
        text_io: TextIO,
        separator: str,
    ) -> list[tuple[str, str]]:
        """Parses a concordance file into ``(pattern, replacement)`` pairs.

        Blank lines and lines starting with ``#`` are skipped.  A leading
        ``=`` makes a pattern explicitly case-sensitive (use ``\\=`` for a
        literal equals sign); a pattern containing uppercase characters is
        implicitly case-sensitive.  Case-insensitive patterns are prefixed
        with the inline flag ``(?i)``.

        Args:
            text_io: The open text stream to read from.
            separator: The column separator.

        Returns:
            The parsed list of ``(pattern, replacement)`` pairs.
        """
        data = []
        for i, line in enumerate(text_io):
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            # Collapse runs of separators; FIX: escape the separator so that
            # regex-special separators (e.g. '|', '.') are treated literally.
            line = re.sub(rf"{re.escape(separator):s}+", separator, line)
            comp = line.split(separator)
            # NOTE: ``str.split`` always yields at least one element, so no
            # emptiness check is needed here.

            case_sensitive = False
            if comp[0].startswith("\\="):
                # Since we use '=' as a prefix for case-sensitive,
                # allow '\=' for literal equals by stripping '\'
                comp[0] = comp[0][1:]
            elif comp[0].startswith("="):
                # Explicitly case-sensitive
                comp[0] = comp[0][1:]
                case_sensitive = True
            elif comp[0] != comp[0].lower():
                # Implicitly case-sensitive because not all-lowercase
                case_sensitive = True
            if not case_sensitive:
                comp[0] = "(?i)" + comp[0]
            if len(comp) == 1:
                # No replacement column: default to an empty replacement.
                comp.append("")
            data.append((comp[0].strip(), comp[1].strip()))
            LOGGER.debug("\tL%03d %r -> %r", i, *data[-1])
        return data

    def parse_text(self, text: str) -> str:
        """Parses a text and replaces found terms from the concordance list by
        the corresponding directives.

        Args:
            text: The text to parse.

        Returns:
            The parsed text.
        """
        LOGGER.info("Parsing text by concordance list")
        excluded_ranges = self._exclude_ranges(text)
        term_matches = self._match_terms(text, excluded_ranges)

        # Insert suitable index marks.  ``offset`` tracks how much the text
        # has grown so later match positions stay valid.
        offset = 0
        for term, replacement in term_matches:
            mark = f"[{term.group(0):s}]{{^{replacement:s}}}"
            text = insert_at_match(text, term, mark, offset=offset)
            offset += len(mark) - len(term.group(0))

        LOGGER.info(
            "Parsed text by concordance list: %d rules generated %d index "
            "marks",
            len(self._concordance),
            len(term_matches),
        )
        return text

    def _exclude_ranges(self, text: str) -> list[tuple[int, int]]:
        """Finds existing directive and HTML-tag ranges to exclude.

        Args:
            text: The text to scan.

        Returns:
            ``(start, end)`` positions of all excluded spans, in text order.
        """
        LOGGER.debug("Excluding text index patterns")
        excluded_ranges = []
        for excl in self._EXCLUDE_PATTERN.finditer(text):
            excluded_ranges.append((excl.start(), excl.end()))
            LOGGER.debug(
                "\tExcluded %r at (%d, %d)",
                excl.group(0),
                excl.start(),
                excl.end(),
            )
        return excluded_ranges

    def _match_terms(
        self,
        text: str,
        excluded_ranges: list[tuple[int, int]],
    ) -> list[tuple[re.Match[str], str]]:
        """Matches every concordance pattern outside the excluded ranges.

        Each accepted match is itself excluded from matching by subsequent
        patterns, so no two marks overlap.  Note: *excluded_ranges* is
        extended and re-sorted in place.

        Args:
            text: The text to scan.
            excluded_ranges: Sorted ``(start, end)`` spans to skip.

        Returns:
            ``(match, replacement)`` pairs sorted by match position.
        """
        term_matches = []
        for pattern, replacement in self:
            # Match and replace this term expression wherever it does not
            # intersect excluded ranges
            new_exclusions = []
            # Index into the sorted exclusion list from which scanning
            # resumes; valid because matches arrive in text order.
            last_checked = 0
            LOGGER.info("Matching pattern %r on text", pattern)
            for term in re.finditer(pattern, text):
                # Check this is not an excluded range.
                is_excluded = False
                for i in range(last_checked, len(excluded_ranges)):
                    start, end = excluded_ranges[i]
                    if end <= term.start():  # Excluded range ends before term
                        last_checked = i
                        continue
                    elif not (
                        end <= term.start() or term.end() <= start
                    ):  # Intersection, abort replacement
                        is_excluded = True
                        LOGGER.debug(
                            "Excluded range %r intersects %r",
                            text[start:end],
                            term.group(0),
                        )
                        break
                    # Excluded range ends after term
                    else:  # if start >= term.end():
                        break

                if not is_excluded:
                    LOGGER.debug(
                        "\tMatched %r at (%d, %d) in text '...%s...'",
                        pattern,
                        term.start(),
                        term.end(),
                        term.group(0),
                    )
                    term_matches.append((term, replacement))
                    new_exclusions.append((term.start(), term.end()))

            # Exclude found terms for this concordance from future matching
            excluded_ranges += new_exclusions
            excluded_ranges.sort(key=lambda e: e[0])

        # Sort all term ranges by order of appearance
        term_matches.sort(key=lambda tm: tm[0].start())
        return term_matches