Coverage for fpdf2_textindex / concordance.py: 84.17%

105 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-24 15:45 +0000

1"""Concordance List.""" 

2 

3from collections.abc import Iterable, Sequence 

4import os 

5import pathlib 

6import re 

7from typing import Final, TextIO, overload 

8 

9from typing_extensions import Self 

10 

11from fpdf2_textindex.constants import LOGGER 

12from fpdf2_textindex.md_emphasis import MDEmphasis 

13from fpdf2_textindex.utils import insert_at_match 

14 

15 

class ConcordanceList(Sequence[tuple[str, str]]):
    """Concordance List.

    The class can load a concordance list from a file and parse a text by it,
    replacing matched terms by text index directives.
    """

    # A bracketed literal span ``[...]`` whose brackets are not escaped with a
    # backslash and that contains no closing bracket or angle bracket.
    _LEADING_BRACKET_SPAN: Final[str] = (
        r"(?<!\\)\[(?P<leading_bracket_span>[^\]<>]+)(?<!\\)\]"
    )
    # A minimal run of characters that are neither whitespace nor any kind of
    # bracket, followed by an emphasis marker (named group ``md_center``).
    _LEADING_NON_WHITESPACE_SPAN: Final[str] = (
        r"(?P<leading_non_whitespace_span>[^\s\[\]\{\}<>]+?)"
        rf"{MDEmphasis.MARKER_PATTERN.format(name='md_center'):s}"
    )
    # The ``{^...}`` parameter group of a text index directive.
    _PARAMS: Final[str] = r"\{\^(?P<params>[^\}<\n]*)\}"
    # A complete index directive: emphasis marker, an optional leading span
    # (plain or bracketed), the parameter group, and a closing marker.
    _DIRECTIVE_PATTERN: re.Pattern[str] = re.compile(
        rf"{MDEmphasis.MARKER_PATTERN.format(name='md_start'):s}"
        rf"(?:{_LEADING_NON_WHITESPACE_SPAN:s}|{_LEADING_BRACKET_SPAN:s})?"
        rf"(?<!>){_PARAMS:s}"
        rf"{MDEmphasis.MARKER_PATTERN.format(name='md_end'):s}"
    )
    # Text ranges that must never be rewritten: existing index directives and
    # anything that looks like an HTML/XML tag.
    _EXCLUDE_PATTERN: re.Pattern[str] = re.compile(
        rf"(?:{_DIRECTIVE_PATTERN.pattern:s})|<.*?>"
    )

    def __init__(self, concordance: Iterable[tuple[str, str]]) -> None:
        """Initializes the concordance list.

        Args:
            concordance: An iterable of ``(pattern, replacement)`` pairs.
        """
        self._concordance: tuple[tuple[str, str], ...] = tuple(concordance)

    @overload
    def __getitem__(self, index: int, /) -> tuple[str, str]: ...

    @overload
    def __getitem__(self, index: slice, /) -> Sequence[tuple[str, str]]: ...

    def __getitem__(
        self,
        index: slice | int,
    ) -> Sequence[tuple[str, str]] | tuple[str, str]:
        """Returns the rule(s) at *index*."""
        return self._concordance[index]

    def __len__(self) -> int:
        """Returns the number of rules."""
        return len(self._concordance)

    def __repr__(self) -> str:
        # FIX: the previous implementation applied ``!r`` to the class name,
        # which wrapped it in quotes (e.g. ``'ConcordanceList'()``).
        return f"{type(self).__name__:s}()"

    @classmethod
    def from_file(
        cls,
        filepath: os.PathLike[str] | str,
        separator: str = "\t",
    ) -> Self:
        r"""Creates a :py:class:`ConcordanceList` from a file.

        Args:
            filepath: The filepath.
            separator: The separator. Defaults to `"\t"`.

        Returns:
            The corresponding :py:class:`ConcordanceList`-instance.

        Raises:
            OSError: If the concordance file cannot be opened.
        """
        filepath = pathlib.Path(filepath)
        LOGGER.info("Reading file %r", filepath.as_posix())
        try:
            # NOTE(review): opens with the platform default encoding; an
            # explicit ``encoding="utf-8"`` may be intended — confirm with the
            # expected concordance file format before changing.
            with filepath.open(mode="r") as f:
                concordance = cls._parse_file(f, separator)
        except OSError as e:
            LOGGER.error(
                "cannot open concordance file %r: %s",
                filepath.as_posix(),
                e,
            )
            raise
        LOGGER.info("Read %d rules from file", len(concordance))
        if not concordance:
            LOGGER.warning(
                "File %r does not comprise rules", filepath.as_posix()
            )
        return cls(concordance)

    @staticmethod
    def _parse_file(
        text_io: TextIO,
        separator: str,
    ) -> list[tuple[str, str]]:
        """Parses a concordance file into ``(pattern, replacement)`` pairs.

        Blank lines and lines starting with ``#`` are skipped.  A leading
        ``=`` makes a pattern explicitly case-sensitive (use ``\\=`` for a
        literal equals sign); a pattern containing uppercase characters is
        implicitly case-sensitive.  Case-insensitive patterns are prefixed
        with the inline flag ``(?i)``.

        Args:
            text_io: The open text stream to read from.
            separator: The column separator.

        Returns:
            The parsed list of ``(pattern, replacement)`` pairs.
        """
        data = []
        for i, line in enumerate(text_io):
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            # Collapse runs of separators; FIX: escape the separator so that
            # regex-special separators (e.g. '|', '.') are treated literally.
            line = re.sub(rf"{re.escape(separator):s}+", separator, line)
            comp = line.split(separator)
            # NOTE: ``str.split`` always yields at least one element, so no
            # emptiness check is needed here.

            case_sensitive = False
            if comp[0].startswith("\\="):
                # Since we use '=' as a prefix for case-sensitive,
                # allow '\=' for literal equals by stripping '\'
                comp[0] = comp[0][1:]
            elif comp[0].startswith("="):
                # Explicitly case-sensitive
                comp[0] = comp[0][1:]
                case_sensitive = True
            elif comp[0] != comp[0].lower():
                # Implicitly case-sensitive because not all-lowercase
                case_sensitive = True
            if not case_sensitive:
                comp[0] = "(?i)" + comp[0]
            if len(comp) == 1:
                # No replacement column: default to an empty replacement.
                comp.append("")
            data.append((comp[0].strip(), comp[1].strip()))
            LOGGER.debug("\tL%03d %r -> %r", i, *data[-1])
        return data

    def parse_text(self, text: str) -> str:
        """Parses a text and replaces found terms from the concordance list by
        the corresponding directives.

        Args:
            text: The text to parse.

        Returns:
            The parsed text.
        """
        LOGGER.info("Parsing text by concordance list")
        excluded_ranges = self._exclude_ranges(text)
        term_matches = self._match_terms(text, excluded_ranges)

        # Insert suitable index marks.  ``offset`` tracks how much the text
        # has grown so later match positions stay valid.
        offset = 0
        for term, replacement in term_matches:
            mark = f"[{term.group(0):s}]{{^{replacement:s}}}"
            text = insert_at_match(text, term, mark, offset=offset)
            offset += len(mark) - len(term.group(0))

        LOGGER.info(
            "Parsed text by concordance list: %d rules generated %d index "
            "marks",
            len(self._concordance),
            len(term_matches),
        )
        return text

    def _exclude_ranges(self, text: str) -> list[tuple[int, int]]:
        """Finds existing directive and HTML-tag ranges to exclude.

        Args:
            text: The text to scan.

        Returns:
            ``(start, end)`` positions of all excluded spans, in text order.
        """
        LOGGER.debug("Excluding text index patterns")
        excluded_ranges = []
        for excl in self._EXCLUDE_PATTERN.finditer(text):
            excluded_ranges.append((excl.start(), excl.end()))
            LOGGER.debug(
                "\tExcluded %r at (%d, %d)",
                excl.group(0),
                excl.start(),
                excl.end(),
            )
        return excluded_ranges

    def _match_terms(
        self,
        text: str,
        excluded_ranges: list[tuple[int, int]],
    ) -> list[tuple[re.Match[str], str]]:
        """Matches every concordance pattern outside the excluded ranges.

        Each accepted match is itself excluded from matching by subsequent
        patterns, so no two marks overlap.  Note: *excluded_ranges* is
        extended and re-sorted in place.

        Args:
            text: The text to scan.
            excluded_ranges: Sorted ``(start, end)`` spans to skip.

        Returns:
            ``(match, replacement)`` pairs sorted by match position.
        """
        term_matches = []
        for pattern, replacement in self:
            # Match and replace this term expression wherever it does not
            # intersect excluded ranges
            new_exclusions = []
            # Index into the sorted exclusion list from which scanning
            # resumes; valid because matches arrive in text order.
            last_checked = 0
            LOGGER.info("Matching pattern %r on text", pattern)
            for term in re.finditer(pattern, text):
                # Check this is not an excluded range.
                is_excluded = False
                for i in range(last_checked, len(excluded_ranges)):
                    start, end = excluded_ranges[i]
                    if end <= term.start():  # Excluded range ends before term
                        last_checked = i
                        continue
                    elif not (
                        end <= term.start() or term.end() <= start
                    ):  # Intersection, abort replacement
                        is_excluded = True
                        LOGGER.debug(
                            "Excluded range %r intersects %r",
                            text[start:end],
                            term.group(0),
                        )
                        break
                    # Excluded range ends after term
                    else:  # if start >= term.end():
                        break

                if not is_excluded:
                    LOGGER.debug(
                        "\tMatched %r at (%d, %d) in text '...%s...'",
                        pattern,
                        term.start(),
                        term.end(),
                        term.group(0),
                    )
                    term_matches.append((term, replacement))
                    new_exclusions.append((term.start(), term.end()))

            # Exclude found terms for this concordance from future matching
            excluded_ranges += new_exclusions
            excluded_ranges.sort(key=lambda e: e[0])

        # Sort all term ranges by order of appearance
        term_matches.sort(key=lambda tm: tm[0].start())
        return term_matches