1 /* Byte-wise substring search, using the Two-Way algorithm.
2 * Copyright (C) 2008, 2010 Eric Blake
3 * Permission to use, copy, modify, and distribute this software
4 * is freely granted, provided that this notice is preserved.
5 */
6
7
8 /* Before including this file, you need to include <string.h>, and define:
9 RESULT_TYPE A macro that expands to the return type.
10 AVAILABLE(h, h_l, j, n_l) A macro that returns nonzero if there are
11 at least N_L bytes left starting at
12 H[J]. H is 'unsigned char *', H_L, J,
13 and N_L are 'size_t'; H_L is an
14 lvalue. For NUL-terminated searches,
15 H_L can be modified each iteration to
16 avoid having to compute the end of H
17 up front.
18
19 For case-insensitivity, you may optionally define:
20 CMP_FUNC(p1, p2, l) A macro that returns 0 iff the first L
21 characters of P1 and P2 are equal.
22 CANON_ELEMENT(c) A macro that canonicalizes an element
23 right after it has been fetched from
24 one of the two strings. The argument
25 is an 'unsigned char'; the result must
26 be an 'unsigned char' as well.
27
28 This file undefines the macros documented above, and defines
29 LONG_NEEDLE_THRESHOLD.
30 */
31
32 #include <limits.h>
33 #include <stdint.h>
34
35 /* We use the Two-Way string matching algorithm, which guarantees
36 linear complexity with constant space. Additionally, for long
37 needles, we also use a bad character shift table similar to the
38 Boyer-Moore algorithm to achieve improved (potentially sub-linear)
39 performance.
40
41 See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
42 and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
43 */
44
45 /* Point at which computing a bad-byte shift table is likely to be
46 worthwhile. Small needles should not compute a table, since it
47 adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
48 speedup no greater than a factor of NEEDLE_LEN. The larger the
49 needle, the better the potential performance gain. On the other
50 hand, on non-POSIX systems with CHAR_BIT larger than eight, the
51 memory required for the table is prohibitive. */
52 #if CHAR_BIT < 10
53 # define LONG_NEEDLE_THRESHOLD 32U
54 #else
55 # define LONG_NEEDLE_THRESHOLD SIZE_MAX
56 #endif
57
58 #define MAX(a, b) ((a < b) ? (b) : (a))
59
60 #ifndef CANON_ELEMENT
61 # define CANON_ELEMENT(c) c
62 #endif
63 #ifndef CMP_FUNC
64 # define CMP_FUNC memcmp
65 #endif
66
67 /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
68 Return the index of the first byte in the right half, and set
69 *PERIOD to the global period of the right half.
70
71 The global period of a string is the smallest index (possibly its
72 length) at which all remaining bytes in the string are repetitions
73 of the prefix (the last repetition may be a subset of the prefix).
74
75 When NEEDLE is factored into two halves, a local period is the
76 length of the smallest word that shares a suffix with the left half
77 and shares a prefix with the right half. All factorizations of a
78 non-empty NEEDLE have a local period of at least 1 and no greater
79 than NEEDLE_LEN.
80
81 A critical factorization has the property that the local period
82 equals the global period. All strings have at least one critical
83 factorization with the left half smaller than the global period.
84
85 Given an ordered alphabet, a critical factorization can be computed
86 in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
87 larger of two ordered maximal suffixes. The ordered maximal
88 suffixes are determined by lexicographic comparison of
89 periodicity. */
90 static size_t
critical_factorization(const unsigned char * needle,size_t needle_len,size_t * period)91 critical_factorization (const unsigned char *needle, size_t needle_len,
92 size_t *period)
93 {
94 /* Index of last byte of left half, or SIZE_MAX. */
95 size_t max_suffix, max_suffix_rev;
96 size_t j; /* Index into NEEDLE for current candidate suffix. */
97 size_t k; /* Offset into current period. */
98 size_t p; /* Intermediate period. */
99 unsigned char a, b; /* Current comparison bytes. */
100
101 /* Invariants:
102 0 <= j < NEEDLE_LEN - 1
103 -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
104 min(max_suffix, max_suffix_rev) < global period of NEEDLE
105 1 <= p <= global period of NEEDLE
106 p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
107 1 <= k <= p
108 */
109
110 /* Perform lexicographic search. */
111 max_suffix = SIZE_MAX;
112 j = 0;
113 k = p = 1;
114 while (j + k < needle_len)
115 {
116 a = CANON_ELEMENT (needle[j + k]);
117 b = CANON_ELEMENT (needle[(size_t)(max_suffix + k)]);
118 if (a < b)
119 {
120 /* Suffix is smaller, period is entire prefix so far. */
121 j += k;
122 k = 1;
123 p = j - max_suffix;
124 }
125 else if (a == b)
126 {
127 /* Advance through repetition of the current period. */
128 if (k != p)
129 ++k;
130 else
131 {
132 j += p;
133 k = 1;
134 }
135 }
136 else /* b < a */
137 {
138 /* Suffix is larger, start over from current location. */
139 max_suffix = j++;
140 k = p = 1;
141 }
142 }
143 *period = p;
144
145 /* Perform reverse lexicographic search. */
146 max_suffix_rev = SIZE_MAX;
147 j = 0;
148 k = p = 1;
149 while (j + k < needle_len)
150 {
151 a = CANON_ELEMENT (needle[j + k]);
152 b = CANON_ELEMENT (needle[max_suffix_rev + k]);
153 if (b < a)
154 {
155 /* Suffix is smaller, period is entire prefix so far. */
156 j += k;
157 k = 1;
158 p = j - max_suffix_rev;
159 }
160 else if (a == b)
161 {
162 /* Advance through repetition of the current period. */
163 if (k != p)
164 ++k;
165 else
166 {
167 j += p;
168 k = 1;
169 }
170 }
171 else /* a < b */
172 {
173 /* Suffix is larger, start over from current location. */
174 max_suffix_rev = j++;
175 k = p = 1;
176 }
177 }
178
179 /* Choose the longer suffix. Return the first byte of the right
180 half, rather than the last byte of the left half. */
181 if (max_suffix_rev + 1 < max_suffix + 1)
182 return max_suffix + 1;
183 *period = p;
184 return max_suffix_rev + 1;
185 }
186
187 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
188 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This
189 method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
190 Performance is guaranteed to be linear, with an initialization cost
191 of 2 * NEEDLE_LEN comparisons.
192
193 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
194 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
195 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
196 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */
197 static inline RETURN_TYPE
two_way_short_needle(const unsigned char * haystack,size_t haystack_len,const unsigned char * needle,size_t needle_len)198 two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
199 const unsigned char *needle, size_t needle_len)
200 {
201 size_t i; /* Index into current byte of NEEDLE. */
202 size_t j; /* Index into current window of HAYSTACK. */
203 size_t period; /* The period of the right half of needle. */
204 size_t suffix; /* The index of the right half of needle. */
205
206 /* Factor the needle into two halves, such that the left half is
207 smaller than the global period, and the right half is
208 periodic (with a period as large as NEEDLE_LEN - suffix). */
209 suffix = critical_factorization (needle, needle_len, &period);
210
211 /* Perform the search. Each iteration compares the right half
212 first. */
213 if (CMP_FUNC (needle, needle + period, suffix) == 0)
214 {
215 /* Entire needle is periodic; a mismatch can only advance by the
216 period, so use memory to avoid rescanning known occurrences
217 of the period. */
218 size_t memory = 0;
219 j = 0;
220 while (AVAILABLE (haystack, haystack_len, j, needle_len))
221 {
222 /* Scan for matches in right half. */
223 i = MAX (suffix, memory);
224 while (i < needle_len && (CANON_ELEMENT (needle[i])
225 == CANON_ELEMENT (haystack[i + j])))
226 ++i;
227 if (needle_len <= i)
228 {
229 /* Scan for matches in left half. */
230 i = suffix - 1;
231 while (memory < i + 1 && (CANON_ELEMENT (needle[i])
232 == CANON_ELEMENT (haystack[i + j])))
233 --i;
234 if (i + 1 < memory + 1)
235 return (RETURN_TYPE) (haystack + j);
236 /* No match, so remember how many repetitions of period
237 on the right half were scanned. */
238 j += period;
239 memory = needle_len - period;
240 }
241 else
242 {
243 j += i - suffix + 1;
244 memory = 0;
245 }
246 }
247 }
248 else
249 {
250 /* The two halves of needle are distinct; no extra memory is
251 required, and any mismatch results in a maximal shift. */
252 period = MAX (suffix, needle_len - suffix) + 1;
253 j = 0;
254 while (AVAILABLE (haystack, haystack_len, j, needle_len))
255 {
256 /* Scan for matches in right half. */
257 i = suffix;
258 while (i < needle_len && (CANON_ELEMENT (needle[i])
259 == CANON_ELEMENT (haystack[i + j])))
260 ++i;
261 if (needle_len <= i)
262 {
263 /* Scan for matches in left half. */
264 i = suffix - 1;
265 while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
266 == CANON_ELEMENT (haystack[i + j])))
267 --i;
268 if (i == SIZE_MAX)
269 return (RETURN_TYPE) (haystack + j);
270 j += period;
271 }
272 else
273 j += i - suffix + 1;
274 }
275 }
276 return NULL;
277 }
278
279 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
280 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This
281 method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
282 Performance is guaranteed to be linear, with an initialization cost
283 of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
284
285 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
286 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
287 and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
288 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
289 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
290 sublinear performance is not possible. */
291 _NOINLINE_STATIC RETURN_TYPE __attribute__ ((__used__))
two_way_long_needle(const unsigned char * haystack,size_t haystack_len,const unsigned char * needle,size_t needle_len)292 two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
293 const unsigned char *needle, size_t needle_len)
294 {
295 size_t i; /* Index into current byte of NEEDLE. */
296 size_t j; /* Index into current window of HAYSTACK. */
297 size_t period; /* The period of the right half of needle. */
298 size_t suffix; /* The index of the right half of needle. */
299 size_t shift_table[1U << CHAR_BIT]; /* See below. */
300
301 /* Factor the needle into two halves, such that the left half is
302 smaller than the global period, and the right half is
303 periodic (with a period as large as NEEDLE_LEN - suffix). */
304 suffix = critical_factorization (needle, needle_len, &period);
305
306 /* Populate shift_table. For each possible byte value c,
307 shift_table[c] is the distance from the last occurrence of c to
308 the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
309 shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0. */
310 for (i = 0; i < 1U << CHAR_BIT; i++)
311 shift_table[i] = needle_len;
312 for (i = 0; i < needle_len; i++)
313 shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
314
315 /* Perform the search. Each iteration compares the right half
316 first. */
317 if (CMP_FUNC (needle, needle + period, suffix) == 0)
318 {
319 /* Entire needle is periodic; a mismatch can only advance by the
320 period, so use memory to avoid rescanning known occurrences
321 of the period. */
322 size_t memory = 0;
323 size_t shift;
324 j = 0;
325 while (AVAILABLE (haystack, haystack_len, j, needle_len))
326 {
327 /* Check the last byte first; if it does not match, then
328 shift to the next possible match location. */
329 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
330 if (0 < shift)
331 {
332 if (memory && shift < period)
333 {
334 /* Since needle is periodic, but the last period has
335 a byte out of place, there can be no match until
336 after the mismatch. */
337 shift = needle_len - period;
338 }
339 memory = 0;
340 j += shift;
341 continue;
342 }
343 /* Scan for matches in right half. The last byte has
344 already been matched, by virtue of the shift table. */
345 i = MAX (suffix, memory);
346 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
347 == CANON_ELEMENT (haystack[i + j])))
348 ++i;
349 if (needle_len - 1 <= i)
350 {
351 /* Scan for matches in left half. */
352 i = suffix - 1;
353 while (memory < i + 1 && (CANON_ELEMENT (needle[i])
354 == CANON_ELEMENT (haystack[i + j])))
355 --i;
356 if (i + 1 < memory + 1)
357 return (RETURN_TYPE) (haystack + j);
358 /* No match, so remember how many repetitions of period
359 on the right half were scanned. */
360 j += period;
361 memory = needle_len - period;
362 }
363 else
364 {
365 j += i - suffix + 1;
366 memory = 0;
367 }
368 }
369 }
370 else
371 {
372 /* The two halves of needle are distinct; no extra memory is
373 required, and any mismatch results in a maximal shift. */
374 size_t shift;
375 period = MAX (suffix, needle_len - suffix) + 1;
376 j = 0;
377 while (AVAILABLE (haystack, haystack_len, j, needle_len))
378 {
379 /* Check the last byte first; if it does not match, then
380 shift to the next possible match location. */
381 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
382 if (0 < shift)
383 {
384 j += shift;
385 continue;
386 }
387 /* Scan for matches in right half. The last byte has
388 already been matched, by virtue of the shift table. */
389 i = suffix;
390 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
391 == CANON_ELEMENT (haystack[i + j])))
392 ++i;
393 if (needle_len - 1 <= i)
394 {
395 /* Scan for matches in left half. */
396 i = suffix - 1;
397 while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
398 == CANON_ELEMENT (haystack[i + j])))
399 --i;
400 if (i == SIZE_MAX)
401 return (RETURN_TYPE) (haystack + j);
402 j += period;
403 }
404 else
405 j += i - suffix + 1;
406 }
407 }
408 return NULL;
409 }
410
411 #undef AVAILABLE
412 #undef CANON_ELEMENT
413 #undef CMP_FUNC
414 #undef MAX
415 #undef RETURN_TYPE
416