1 |
|
%% Copyright (c) 2023 Peter Morgan <peter.james.morgan@gmail.com> |
2 |
|
%% |
3 |
|
%% Licensed under the Apache License, Version 2.0 (the "License"); |
4 |
|
%% you may not use this file except in compliance with the License. |
5 |
|
%% You may obtain a copy of the License at |
6 |
|
%% |
7 |
|
%% http://www.apache.org/licenses/LICENSE-2.0 |
8 |
|
%% |
9 |
|
%% Unless required by applicable law or agreed to in writing, software |
10 |
|
%% distributed under the License is distributed on an "AS IS" BASIS, |
11 |
|
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 |
|
%% See the License for the specific language governing permissions and |
13 |
|
%% limitations under the License. |
14 |
|
|
15 |
|
%% @doc Parser combinators that deal with unicode input. |
16 |
|
|
17 |
|
-module(scran_character_complete). |
18 |
|
|
19 |
|
|
20 |
|
-feature(maybe_expr, enable). |
21 |
|
|
22 |
|
|
23 |
|
-export([alpha0/0]). |
24 |
|
-export([alpha1/0]). |
25 |
|
-export([alphanumeric0/0]). |
26 |
|
-export([alphanumeric1/0]). |
27 |
|
-export([digit0/0]). |
28 |
|
-export([digit1/0]). |
29 |
|
-export([hex_digit0/0]). |
30 |
|
-export([hex_digit1/0]). |
31 |
|
-export([multispace0/0]). |
32 |
|
-export([multispace1/0]). |
33 |
|
-export([none_of/1]). |
34 |
|
-export([one_of/1]). |
35 |
|
-export([re/1]). |
36 |
|
-export([re_no_case/1]). |
37 |
|
-export([tag/1]). |
38 |
|
-export([tag_no_case/1]). |
39 |
|
-export([take/1]). |
40 |
|
-include_lib("kernel/include/logger.hrl"). |
41 |
|
|
42 |
|
|
43 |
|
%% @doc Build a parser that consumes exactly `N' characters (as
%% counted by `string:length/1') from the front of the input. The
%% parser returns `{Remaining, Taken}', or `nomatch' when the input
%% holds fewer than `N' characters.

-spec take(pos_integer()) -> scran:parser().

take(N) ->
    fun
        (Input) ->
            ?LOG_DEBUG(#{n => N, input => Input}),
            case string:length(Input) >= N of
                true ->
                    {string:slice(Input, N), string:slice(Input, 0, N)};

                false ->
                    nomatch
            end
    end.
59 |
|
|
60 |
|
|
61 |
|
%% @doc Build a parser from a case sensitive regular expression. The
%% expression is compiled with the `anchored' option, so it has to
%% match at the very start of the input.

-spec re(iodata()) -> scran:parser().

re(Regex) ->
    CompileOptions = [anchored],
    re(Regex, CompileOptions).
68 |
|
|
69 |
|
|
70 |
|
%% @doc Build a parser from a case insensitive regular expression.
%% The expression is compiled with the `anchored' and `caseless'
%% options, so it has to match at the very start of the input.

-spec re_no_case(iodata()) -> scran:parser().

re_no_case(Regex) ->
    CompileOptions = [anchored, caseless],
    re(Regex, CompileOptions).
77 |
|
|
78 |
|
|
79 |
|
-type compile_option() :: unicode | anchored | caseless | dollar_endonly
                        | dotall | extended | firstline | multiline
                        | no_auto_capture | dupnames | ungreedy
                        | {newline, nl_spec()}
                        | bsr_anycrlf | bsr_unicode
                        | no_start_optimize | ucp | never_utf.

-type nl_spec() :: cr | crlf | lf | anycrlf | any.

%% Compile Regex once with CompileOptions and return a parser that
%% runs it against its input. On a match the parser returns
%% `{Remaining, Matched}'; otherwise the `nomatch' from `re:run/2' is
%% passed straight through by the `maybe' expression.

-spec re(iodata(), [compile_option()]) -> scran:parser().

re(Regex, CompileOptions) ->
    {ok, MP} = re:compile(Regex, CompileOptions),
    fun
        (Input) ->
            ?LOG_DEBUG(#{regex => Regex, input => Input}),
            maybe
                {match, [{Offset, Length} | _]} ?= re:run(Input, MP),
                split_match(Input, Offset, Length)
            end
    end.

%% Split Input around the match reported by `re:run/2'.
%%
%% `re:run/2' reports the match as a byte offset and a byte length,
%% whereas `string:slice/2,3' counts grapheme clusters. The two differ
%% for multi-byte UTF-8 binaries and for "\r\n" (a single grapheme
%% cluster), so the subject is split by bytes here.

split_match(Input, Offset, Length) when is_binary(Input) ->
    <<_:Offset/binary, Matched:Length/binary, Remaining/binary>> = Input,
    {Remaining, Matched};

split_match(Input, Offset, Length) when is_list(Input) ->
    %% Both callers in this module compile without `unicode', so a list
    %% subject is iodata: flatten it so that byte offsets are list
    %% positions.
    Bytes = binary_to_list(iolist_to_binary(Input)),
    {Matched, Remaining} = lists:split(Length, lists:nthtail(Offset, Bytes)),
    {Remaining, Matched}.
101 |
|
|
102 |
|
|
103 |
|
%% @doc Return the matching case sensitive character data. The
%% parser returns `{Remaining, Matched}' when the input starts with
%% `Tag', and `nomatch' otherwise. When the input is a binary and
%% `Tag' is a list, the matched tag is returned as a UTF-8 binary so
%% that it has the same representation as the input.

-spec tag(unicode:chardata()) -> scran:parser().

tag(Tag) ->
    fun
        (Input) ->
            ?LOG_DEBUG(#{tag => Tag, input => Input}),
            case string:prefix(Input, Tag) of
                nomatch ->
                    nomatch;

                Remainder when is_binary(Input),
                               is_list(Tag) ->
                    %% list_to_binary/1 only accepts bytes: it raises
                    %% badarg for code points above 255 and produces
                    %% Latin-1 rather than UTF-8 for 128..255.
                    {Remainder, unicode:characters_to_binary(Tag)};

                Remainder ->
                    {Remainder, Tag}
            end
    end.
123 |
|
|
124 |
|
|
125 |
|
%% @doc Return the matching case insensitive character data. Both
%% the input and `Tag' are lowercased before the prefix comparison;
%% on success the parser returns `{Remaining, Matched}' where
%% `Matched' is sliced from the original input, so it keeps the case
%% the input was written in. Returns `nomatch' otherwise.

-spec tag_no_case(unicode:chardata()) -> scran:parser().

tag_no_case(Tag) ->
    fun
        (Input) ->
            ?LOG_DEBUG(#{tag => Tag, input => Input}),
            LoweredInput = string:lowercase(Input),
            LoweredTag = string:lowercase(Tag),
            case string:prefix(LoweredInput, LoweredTag) of
                nomatch ->
                    nomatch;

                _Remainder ->
                    %% Slice the original input rather than the
                    %% lowercased copy.
                    Width = string:length(Tag),
                    {string:slice(Input, Width), string:slice(Input, 0, Width)}
            end
    end.
145 |
|
|
146 |
|
|
147 |
|
%% @doc Return one of the matching characters. The parser takes the
%% first character of the input and, when it can be found in
%% `Choice', returns `{Remaining, Character}'. It returns `nomatch'
%% for empty input or when the character is not in `Choice'.

-spec one_of([unicode:chardata()]) -> scran:parser().

one_of(Choice) ->
    fun
        (Input) ->
            ?LOG_DEBUG(#{choice => Choice, input => Input}),
            case string:is_empty(Input) of
                true ->
                    nomatch;

                false ->
                    Head = string:slice(Input, 0, 1),
                    case string:find(Choice, Head) of
                        nomatch ->
                            nomatch;

                        _Found ->
                            {string:slice(Input, 1), Head}
                    end
            end
    end.
166 |
|
|
167 |
|
|
168 |
|
%% @doc Return the first character of the input if it is none of the
%% supplied characters. On success the parser returns
%% `{Remaining, Character}'. It returns `nomatch' for empty input or
%% when the character can be found in `Choice'.

-spec none_of([unicode:chardata()]) -> scran:parser().

none_of(Choice) ->
    fun
        (Input) ->
            ?LOG_DEBUG(#{choice => Choice, input => Input}),
            case string:is_empty(Input) of
                true ->
                    nomatch;

                false ->
                    Head = string:slice(Input, 0, 1),
                    case string:find(Choice, Head) of
                        nomatch ->
                            {string:slice(Input, 1), Head};

                        _Found ->
                            nomatch
                    end
            end
    end.
187 |
|
|
188 |
|
|
189 |
|
%% @doc Recognizes zero or more lowercase and uppercase ASCII
%% alphabetic characters: a-z, A-Z. The parser returns
%% `{Remaining, Matched}', where `Matched' may be empty.

-spec alpha0() -> scran:parser().

alpha0() ->
    zero_or_more(alpha()).
199 |
|
|
200 |
|
|
201 |
|
%% @doc Recognizes one or more lowercase and uppercase ASCII
%% alphabetic characters: a-z, A-Z. The parser returns
%% `{Remaining, Matched}', or `nomatch' when no character matched.

-spec alpha1() -> scran:parser().

alpha1() ->
    at_least_one(alpha()).
211 |
|
|
212 |
|
|
213 |
|
%% @doc Recognizes zero or more ASCII numerical and alphabetic
%% characters: 0-9, a-z, A-Z. The parser returns
%% `{Remaining, Matched}', where `Matched' may be empty.

-spec alphanumeric0() -> scran:parser().

alphanumeric0() ->
    zero_or_more(alphanumeric()).
223 |
|
|
224 |
|
|
225 |
|
%% @doc Recognizes one or more ASCII numerical and alphabetic
%% characters: 0-9, a-z, A-Z. The parser returns
%% `{Remaining, Matched}', or `nomatch' when no character matched.

-spec alphanumeric1() -> scran:parser().

alphanumeric1() ->
    at_least_one(alphanumeric()).
235 |
|
|
236 |
|
|
237 |
|
%% @doc Recognizes zero or more ASCII numerical characters: 0-9. The
%% parser returns `{Remaining, Matched}', where `Matched' may be
%% empty.

-spec digit0() -> scran:parser().

digit0() ->
    zero_or_more(numeric()).
246 |
|
|
247 |
|
|
248 |
|
%% @doc Recognizes one or more ASCII numerical characters: 0-9. The
%% parser returns `{Remaining, Matched}', or `nomatch' when no
%% character matched.

-spec digit1() -> scran:parser().

digit1() ->
    at_least_one(numeric()).
257 |
|
|
258 |
|
|
259 |
|
%% @doc Recognizes zero or more spaces, tabs, carriage returns and
%% line feeds. The parser returns `{Remaining, Matched}', where
%% `Matched' may be empty.

-spec multispace0() -> scran:parser().

multispace0() ->
    zero_or_more(whitespace()).
269 |
|
|
270 |
|
|
271 |
|
%% @doc Recognizes zero or more ASCII hexadecimal numerical
%% characters: 0-9, A-F, a-f. The parser returns
%% `{Remaining, Matched}', where `Matched' may be empty.

-spec hex_digit0() -> scran:parser().

hex_digit0() ->
    zero_or_more(hex()).
281 |
|
|
282 |
|
|
283 |
|
%% @doc Recognizes one or more ASCII hexadecimal numerical
%% characters: 0-9, A-F, a-f. The parser returns
%% `{Remaining, Matched}', or `nomatch' when no character matched.

-spec hex_digit1() -> scran:parser().

hex_digit1() ->
    at_least_one(hex()).
293 |
|
|
294 |
|
|
295 |
|
%% @doc Recognizes one or more spaces, tabs, carriage returns and
%% line feeds. The parser returns `{Remaining, Matched}', or
%% `nomatch' when no character matched.

-spec multispace1() -> scran:parser().

multispace1() ->
    at_least_one(whitespace()).
305 |
|
|
306 |
|
|
307 |
|
%% Build a parser that consumes the longest leading run of
%% Characters, returning `{Remaining, Matched}', and that fails with
%% `nomatch' when that run is empty.
at_least_one(Characters) ->
    Parser = zero_or_more(Characters),
    fun
        (Input) ->
            {_Remaining, Matched} = Outcome = Parser(Input),
            case string:length(Matched) of
                0 ->
                    nomatch;

                _AtLeastOne ->
                    Outcome
            end
    end.
319 |
|
|
320 |
|
%% Build a parser that consumes the longest leading run of
%% Characters. `string:take/2' yields `{Leading, Trailing}'; the
%% result is flipped into `{Remaining, Matched}' order.
zero_or_more(Characters) ->
    fun
        (Input) ->
            Taken = string:take(Input, Characters),
            flip(Taken)
    end.
325 |
|
|
326 |
|
|
327 |
|
%% Return Tuple with its elements in reverse order.
flip(Tuple) ->
    Elements = tuple_to_list(Tuple),
    Reversed = lists:reverse(Elements),
    list_to_tuple(Reversed).
329 |
|
|
330 |
|
|
331 |
|
%% The ASCII letters: a-z followed by A-Z.
alpha() ->
    "abcdefghijklmnopqrstuvwxyz" "ABCDEFGHIJKLMNOPQRSTUVWXYZ".
333 |
|
|
334 |
|
|
335 |
|
%% The ASCII decimal digits: 0-9.
numeric() ->
    "0123456789".
337 |
|
|
338 |
|
|
339 |
|
%% The ASCII letters followed by the ASCII decimal digits.
alphanumeric() ->
    lists:append(alpha(), numeric()).
341 |
|
|
342 |
|
%% The ASCII hexadecimal digits: 0-9, a-f, A-F.
hex() ->
    lists:append([numeric(), "abcdef", "ABCDEF"]).
344 |
|
|
345 |
|
%% The grapheme clusters treated as whitespace: space, tab, line
%% feed, carriage return and the CRLF pair.
%%
%% `string:take/2' compares whole grapheme clusters and "\r\n" is a
%% single cluster, so it has to be listed as an element of its own;
%% listing only $\r and $\n does not match a CRLF line ending.
whitespace() ->
    ["\r\n", $\s, $\t, $\n, $\r].