1 /* Byte-wise substring search, using the Two-Way algorithm.
2 * Copyright (C) 2008, 2010 Eric Blake
3 * Permission to use, copy, modify, and distribute this software
4 * is freely granted, provided that this notice is preserved.
5 */
6
7
8 /* Before including this file, you need to include <string.h>, and define:
9 RESULT_TYPE A macro that expands to the return type.
10 AVAILABLE(h, h_l, j, n_l) A macro that returns nonzero if there are
11 at least N_L bytes left starting at
12 H[J]. H is 'unsigned char *', H_L, J,
13 and N_L are 'size_t'; H_L is an
14 lvalue. For NUL-terminated searches,
15 H_L can be modified each iteration to
16 avoid having to compute the end of H
17 up front.
18
19 For case-insensitivity, you may optionally define:
20 CMP_FUNC(p1, p2, l) A macro that returns 0 iff the first L
21 characters of P1 and P2 are equal.
22 CANON_ELEMENT(c) A macro that canonicalizes an element
23 right after it has been fetched from
24 one of the two strings. The argument
25 is an 'unsigned char'; the result must
26 be an 'unsigned char' as well.
27
28 This file undefines the macros documented above, and defines
29 LONG_NEEDLE_THRESHOLD.
30 */
31
32 #include <limits.h>
33 #include <stdint.h>
34 #include <_ansi.h>
35
36 /* We use the Two-Way string matching algorithm, which guarantees
37 linear complexity with constant space. Additionally, for long
38 needles, we also use a bad character shift table similar to the
39 Boyer-Moore algorithm to achieve improved (potentially sub-linear)
40 performance.
41
42 See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
43 and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
44 */
45
46 /* Point at which computing a bad-byte shift table is likely to be
47 worthwhile. Small needles should not compute a table, since it
48 adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
49 speedup no greater than a factor of NEEDLE_LEN. The larger the
50 needle, the better the potential performance gain. On the other
51 hand, on non-POSIX systems with CHAR_BIT larger than eight, the
52 memory required for the table is prohibitive. */
53 #if CHAR_BIT < 10
54 # define LONG_NEEDLE_THRESHOLD 32U
55 #else
56 # define LONG_NEEDLE_THRESHOLD SIZE_MAX
57 #endif
58
59 #define MAX(a, b) ((a < b) ? (b) : (a))
60
61 #ifndef CANON_ELEMENT
62 # define CANON_ELEMENT(c) c
63 #endif
64 #ifndef CMP_FUNC
65 # define CMP_FUNC memcmp
66 #endif
67
68 /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
69 Return the index of the first byte in the right half, and set
70 *PERIOD to the global period of the right half.
71
72 The global period of a string is the smallest index (possibly its
73 length) at which all remaining bytes in the string are repetitions
74 of the prefix (the last repetition may be a subset of the prefix).
75
76 When NEEDLE is factored into two halves, a local period is the
77 length of the smallest word that shares a suffix with the left half
78 and shares a prefix with the right half. All factorizations of a
79 non-empty NEEDLE have a local period of at least 1 and no greater
80 than NEEDLE_LEN.
81
82 A critical factorization has the property that the local period
83 equals the global period. All strings have at least one critical
84 factorization with the left half smaller than the global period.
85
86 Given an ordered alphabet, a critical factorization can be computed
87 in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
88 larger of two ordered maximal suffixes. The ordered maximal
89 suffixes are determined by lexicographic comparison of
90 periodicity. */
91 static size_t
critical_factorization(const unsigned char * needle,size_t needle_len,size_t * period)92 critical_factorization (const unsigned char *needle, size_t needle_len,
93 size_t *period)
94 {
95 /* Index of last byte of left half, or SIZE_MAX. */
96 size_t max_suffix, max_suffix_rev;
97 size_t j; /* Index into NEEDLE for current candidate suffix. */
98 size_t k; /* Offset into current period. */
99 size_t p; /* Intermediate period. */
100 unsigned char a, b; /* Current comparison bytes. */
101
102 /* Invariants:
103 0 <= j < NEEDLE_LEN - 1
104 -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
105 min(max_suffix, max_suffix_rev) < global period of NEEDLE
106 1 <= p <= global period of NEEDLE
107 p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
108 1 <= k <= p
109 */
110
111 /* Perform lexicographic search. */
112 max_suffix = SIZE_MAX;
113 j = 0;
114 k = p = 1;
115 while (j + k < needle_len)
116 {
117 a = CANON_ELEMENT (needle[j + k]);
118 b = CANON_ELEMENT (needle[(size_t)(max_suffix + k)]);
119 if (a < b)
120 {
121 /* Suffix is smaller, period is entire prefix so far. */
122 j += k;
123 k = 1;
124 p = j - max_suffix;
125 }
126 else if (a == b)
127 {
128 /* Advance through repetition of the current period. */
129 if (k != p)
130 ++k;
131 else
132 {
133 j += p;
134 k = 1;
135 }
136 }
137 else /* b < a */
138 {
139 /* Suffix is larger, start over from current location. */
140 max_suffix = j++;
141 k = p = 1;
142 }
143 }
144 *period = p;
145
146 /* Perform reverse lexicographic search. */
147 max_suffix_rev = SIZE_MAX;
148 j = 0;
149 k = p = 1;
150 while (j + k < needle_len)
151 {
152 a = CANON_ELEMENT (needle[j + k]);
153 b = CANON_ELEMENT (needle[max_suffix_rev + k]);
154 if (b < a)
155 {
156 /* Suffix is smaller, period is entire prefix so far. */
157 j += k;
158 k = 1;
159 p = j - max_suffix_rev;
160 }
161 else if (a == b)
162 {
163 /* Advance through repetition of the current period. */
164 if (k != p)
165 ++k;
166 else
167 {
168 j += p;
169 k = 1;
170 }
171 }
172 else /* a < b */
173 {
174 /* Suffix is larger, start over from current location. */
175 max_suffix_rev = j++;
176 k = p = 1;
177 }
178 }
179
180 /* Choose the longer suffix. Return the first byte of the right
181 half, rather than the last byte of the left half. */
182 if (max_suffix_rev + 1 < max_suffix + 1)
183 return max_suffix + 1;
184 *period = p;
185 return max_suffix_rev + 1;
186 }
187
188 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
189 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This
190 method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
191 Performance is guaranteed to be linear, with an initialization cost
192 of 2 * NEEDLE_LEN comparisons.
193
194 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
195 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
196 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
197 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */
198 static inline RETURN_TYPE
two_way_short_needle(const unsigned char * haystack,size_t haystack_len,const unsigned char * needle,size_t needle_len)199 two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
200 const unsigned char *needle, size_t needle_len)
201 {
202 size_t i; /* Index into current byte of NEEDLE. */
203 size_t j; /* Index into current window of HAYSTACK. */
204 size_t period; /* The period of the right half of needle. */
205 size_t suffix; /* The index of the right half of needle. */
206
207 /* Factor the needle into two halves, such that the left half is
208 smaller than the global period, and the right half is
209 periodic (with a period as large as NEEDLE_LEN - suffix). */
210 suffix = critical_factorization (needle, needle_len, &period);
211
212 /* Perform the search. Each iteration compares the right half
213 first. */
214 if (CMP_FUNC (needle, needle + period, suffix) == 0)
215 {
216 /* Entire needle is periodic; a mismatch can only advance by the
217 period, so use memory to avoid rescanning known occurrences
218 of the period. */
219 size_t memory = 0;
220 j = 0;
221 while (AVAILABLE (haystack, haystack_len, j, needle_len))
222 {
223 /* Scan for matches in right half. */
224 i = MAX (suffix, memory);
225 while (i < needle_len && (CANON_ELEMENT (needle[i])
226 == CANON_ELEMENT (haystack[i + j])))
227 ++i;
228 if (needle_len <= i)
229 {
230 /* Scan for matches in left half. */
231 i = suffix - 1;
232 while (memory < i + 1 && (CANON_ELEMENT (needle[i])
233 == CANON_ELEMENT (haystack[i + j])))
234 --i;
235 if (i + 1 < memory + 1)
236 return (RETURN_TYPE) (haystack + j);
237 /* No match, so remember how many repetitions of period
238 on the right half were scanned. */
239 j += period;
240 memory = needle_len - period;
241 }
242 else
243 {
244 j += i - suffix + 1;
245 memory = 0;
246 }
247 }
248 }
249 else
250 {
251 /* The two halves of needle are distinct; no extra memory is
252 required, and any mismatch results in a maximal shift. */
253 period = MAX (suffix, needle_len - suffix) + 1;
254 j = 0;
255 while (AVAILABLE (haystack, haystack_len, j, needle_len))
256 {
257 /* Scan for matches in right half. */
258 i = suffix;
259 while (i < needle_len && (CANON_ELEMENT (needle[i])
260 == CANON_ELEMENT (haystack[i + j])))
261 ++i;
262 if (needle_len <= i)
263 {
264 /* Scan for matches in left half. */
265 i = suffix - 1;
266 while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
267 == CANON_ELEMENT (haystack[i + j])))
268 --i;
269 if (i == SIZE_MAX)
270 return (RETURN_TYPE) (haystack + j);
271 j += period;
272 }
273 else
274 j += i - suffix + 1;
275 }
276 }
277 return NULL;
278 }
279
280 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
281 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This
282 method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
283 Performance is guaranteed to be linear, with an initialization cost
284 of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
285
286 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
287 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
288 and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
289 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
290 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
291 sublinear performance is not possible. */
292 _NOINLINE_STATIC RETURN_TYPE __attribute__ ((__used__))
two_way_long_needle(const unsigned char * haystack,size_t haystack_len,const unsigned char * needle,size_t needle_len)293 two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
294 const unsigned char *needle, size_t needle_len)
295 {
296 size_t i; /* Index into current byte of NEEDLE. */
297 size_t j; /* Index into current window of HAYSTACK. */
298 size_t period; /* The period of the right half of needle. */
299 size_t suffix; /* The index of the right half of needle. */
300 size_t shift_table[1U << CHAR_BIT]; /* See below. */
301
302 /* Factor the needle into two halves, such that the left half is
303 smaller than the global period, and the right half is
304 periodic (with a period as large as NEEDLE_LEN - suffix). */
305 suffix = critical_factorization (needle, needle_len, &period);
306
307 /* Populate shift_table. For each possible byte value c,
308 shift_table[c] is the distance from the last occurrence of c to
309 the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
310 shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0. */
311 for (i = 0; i < 1U << CHAR_BIT; i++)
312 shift_table[i] = needle_len;
313 for (i = 0; i < needle_len; i++)
314 shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
315
316 /* Perform the search. Each iteration compares the right half
317 first. */
318 if (CMP_FUNC (needle, needle + period, suffix) == 0)
319 {
320 /* Entire needle is periodic; a mismatch can only advance by the
321 period, so use memory to avoid rescanning known occurrences
322 of the period. */
323 size_t memory = 0;
324 size_t shift;
325 j = 0;
326 while (AVAILABLE (haystack, haystack_len, j, needle_len))
327 {
328 /* Check the last byte first; if it does not match, then
329 shift to the next possible match location. */
330 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
331 if (0 < shift)
332 {
333 if (memory && shift < period)
334 {
335 /* Since needle is periodic, but the last period has
336 a byte out of place, there can be no match until
337 after the mismatch. */
338 shift = needle_len - period;
339 }
340 memory = 0;
341 j += shift;
342 continue;
343 }
344 /* Scan for matches in right half. The last byte has
345 already been matched, by virtue of the shift table. */
346 i = MAX (suffix, memory);
347 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
348 == CANON_ELEMENT (haystack[i + j])))
349 ++i;
350 if (needle_len - 1 <= i)
351 {
352 /* Scan for matches in left half. */
353 i = suffix - 1;
354 while (memory < i + 1 && (CANON_ELEMENT (needle[i])
355 == CANON_ELEMENT (haystack[i + j])))
356 --i;
357 if (i + 1 < memory + 1)
358 return (RETURN_TYPE) (haystack + j);
359 /* No match, so remember how many repetitions of period
360 on the right half were scanned. */
361 j += period;
362 memory = needle_len - period;
363 }
364 else
365 {
366 j += i - suffix + 1;
367 memory = 0;
368 }
369 }
370 }
371 else
372 {
373 /* The two halves of needle are distinct; no extra memory is
374 required, and any mismatch results in a maximal shift. */
375 size_t shift;
376 period = MAX (suffix, needle_len - suffix) + 1;
377 j = 0;
378 while (AVAILABLE (haystack, haystack_len, j, needle_len))
379 {
380 /* Check the last byte first; if it does not match, then
381 shift to the next possible match location. */
382 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
383 if (0 < shift)
384 {
385 j += shift;
386 continue;
387 }
388 /* Scan for matches in right half. The last byte has
389 already been matched, by virtue of the shift table. */
390 i = suffix;
391 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
392 == CANON_ELEMENT (haystack[i + j])))
393 ++i;
394 if (needle_len - 1 <= i)
395 {
396 /* Scan for matches in left half. */
397 i = suffix - 1;
398 while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
399 == CANON_ELEMENT (haystack[i + j])))
400 --i;
401 if (i == SIZE_MAX)
402 return (RETURN_TYPE) (haystack + j);
403 j += period;
404 }
405 else
406 j += i - suffix + 1;
407 }
408 }
409 return NULL;
410 }
411
412 #undef AVAILABLE
413 #undef CANON_ELEMENT
414 #undef CMP_FUNC
415 #undef MAX
416 #undef RETURN_TYPE
417