1 /* Byte-wise substring search, using the Two-Way algorithm.
2  * Copyright (C) 2008, 2010 Eric Blake
3  * Permission to use, copy, modify, and distribute this software
4  * is freely granted, provided that this notice is preserved.
5  */
6 
7 
8 /* Before including this file, you need to include <string.h>, and define:
9      RESULT_TYPE		A macro that expands to the return type.
10      AVAILABLE(h, h_l, j, n_l)	A macro that returns nonzero if there are
11 				at least N_L bytes left starting at
12 				H[J].  H is 'unsigned char *', H_L, J,
13 				and N_L are 'size_t'; H_L is an
14 				lvalue.  For NUL-terminated searches,
15 				H_L can be modified each iteration to
16 				avoid having to compute the end of H
17 				up front.
18 
19   For case-insensitivity, you may optionally define:
20      CMP_FUNC(p1, p2, l)	A macro that returns 0 iff the first L
21 				characters of P1 and P2 are equal.
22      CANON_ELEMENT(c)		A macro that canonicalizes an element
23 				right after it has been fetched from
24 				one of the two strings.  The argument
25 				is an 'unsigned char'; the result must
26 				be an 'unsigned char' as well.
27 
28   This file undefines the macros documented above, and defines
29   LONG_NEEDLE_THRESHOLD.
30 */
31 
32 #include <limits.h>
33 #include <stdint.h>
34 
35 /* We use the Two-Way string matching algorithm, which guarantees
36    linear complexity with constant space.  Additionally, for long
37    needles, we also use a bad character shift table similar to the
38    Boyer-Moore algorithm to achieve improved (potentially sub-linear)
39    performance.
40 
41    See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
42    and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
43 */
44 
45 /* Point at which computing a bad-byte shift table is likely to be
46    worthwhile.  Small needles should not compute a table, since it
47    adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
48    speedup no greater than a factor of NEEDLE_LEN.  The larger the
49    needle, the better the potential performance gain.  On the other
50    hand, on non-POSIX systems with CHAR_BIT larger than eight, the
51    memory required for the table is prohibitive.  */
52 #if CHAR_BIT < 10
53 # define LONG_NEEDLE_THRESHOLD 32U
54 #else
55 # define LONG_NEEDLE_THRESHOLD SIZE_MAX
56 #endif
57 
58 #define MAX(a, b) ((a < b) ? (b) : (a))
59 
60 #ifndef CANON_ELEMENT
61 # define CANON_ELEMENT(c) c
62 #endif
63 #ifndef CMP_FUNC
64 # define CMP_FUNC memcmp
65 #endif
66 
67 /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
68    Return the index of the first byte in the right half, and set
69    *PERIOD to the global period of the right half.
70 
71    The global period of a string is the smallest index (possibly its
72    length) at which all remaining bytes in the string are repetitions
73    of the prefix (the last repetition may be a subset of the prefix).
74 
75    When NEEDLE is factored into two halves, a local period is the
76    length of the smallest word that shares a suffix with the left half
77    and shares a prefix with the right half.  All factorizations of a
78    non-empty NEEDLE have a local period of at least 1 and no greater
79    than NEEDLE_LEN.
80 
81    A critical factorization has the property that the local period
82    equals the global period.  All strings have at least one critical
83    factorization with the left half smaller than the global period.
84 
85    Given an ordered alphabet, a critical factorization can be computed
86    in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
87    larger of two ordered maximal suffixes.  The ordered maximal
88    suffixes are determined by lexicographic comparison of
89    periodicity.  */
90 static size_t
critical_factorization(const unsigned char * needle,size_t needle_len,size_t * period)91 critical_factorization (const unsigned char *needle, size_t needle_len,
92 			size_t *period)
93 {
94   /* Index of last byte of left half, or SIZE_MAX.  */
95   size_t max_suffix, max_suffix_rev;
96   size_t j; /* Index into NEEDLE for current candidate suffix.  */
97   size_t k; /* Offset into current period.  */
98   size_t p; /* Intermediate period.  */
99   unsigned char a, b; /* Current comparison bytes.  */
100 
101   /* Invariants:
102      0 <= j < NEEDLE_LEN - 1
103      -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
104      min(max_suffix, max_suffix_rev) < global period of NEEDLE
105      1 <= p <= global period of NEEDLE
106      p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
107      1 <= k <= p
108   */
109 
110   /* Perform lexicographic search.  */
111   max_suffix = SIZE_MAX;
112   j = 0;
113   k = p = 1;
114   while (j + k < needle_len)
115     {
116       a = CANON_ELEMENT (needle[j + k]);
117       b = CANON_ELEMENT (needle[(size_t)(max_suffix + k)]);
118       if (a < b)
119 	{
120 	  /* Suffix is smaller, period is entire prefix so far.  */
121 	  j += k;
122 	  k = 1;
123 	  p = j - max_suffix;
124 	}
125       else if (a == b)
126 	{
127 	  /* Advance through repetition of the current period.  */
128 	  if (k != p)
129 	    ++k;
130 	  else
131 	    {
132 	      j += p;
133 	      k = 1;
134 	    }
135 	}
136       else /* b < a */
137 	{
138 	  /* Suffix is larger, start over from current location.  */
139 	  max_suffix = j++;
140 	  k = p = 1;
141 	}
142     }
143   *period = p;
144 
145   /* Perform reverse lexicographic search.  */
146   max_suffix_rev = SIZE_MAX;
147   j = 0;
148   k = p = 1;
149   while (j + k < needle_len)
150     {
151       a = CANON_ELEMENT (needle[j + k]);
152       b = CANON_ELEMENT (needle[max_suffix_rev + k]);
153       if (b < a)
154 	{
155 	  /* Suffix is smaller, period is entire prefix so far.  */
156 	  j += k;
157 	  k = 1;
158 	  p = j - max_suffix_rev;
159 	}
160       else if (a == b)
161 	{
162 	  /* Advance through repetition of the current period.  */
163 	  if (k != p)
164 	    ++k;
165 	  else
166 	    {
167 	      j += p;
168 	      k = 1;
169 	    }
170 	}
171       else /* a < b */
172 	{
173 	  /* Suffix is larger, start over from current location.  */
174 	  max_suffix_rev = j++;
175 	  k = p = 1;
176 	}
177     }
178 
179   /* Choose the longer suffix.  Return the first byte of the right
180      half, rather than the last byte of the left half.  */
181   if (max_suffix_rev + 1 < max_suffix + 1)
182     return max_suffix + 1;
183   *period = p;
184   return max_suffix_rev + 1;
185 }
186 
187 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
188    NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
189    method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
190    Performance is guaranteed to be linear, with an initialization cost
191    of 2 * NEEDLE_LEN comparisons.
192 
193    If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
194    most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
195    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
196    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.  */
197 static inline RETURN_TYPE
two_way_short_needle(const unsigned char * haystack,size_t haystack_len,const unsigned char * needle,size_t needle_len)198 two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
199 		      const unsigned char *needle, size_t needle_len)
200 {
201   size_t i; /* Index into current byte of NEEDLE.  */
202   size_t j; /* Index into current window of HAYSTACK.  */
203   size_t period; /* The period of the right half of needle.  */
204   size_t suffix; /* The index of the right half of needle.  */
205 
206   /* Factor the needle into two halves, such that the left half is
207      smaller than the global period, and the right half is
208      periodic (with a period as large as NEEDLE_LEN - suffix).  */
209   suffix = critical_factorization (needle, needle_len, &period);
210 
211   /* Perform the search.  Each iteration compares the right half
212      first.  */
213   if (CMP_FUNC (needle, needle + period, suffix) == 0)
214     {
215       /* Entire needle is periodic; a mismatch can only advance by the
216 	 period, so use memory to avoid rescanning known occurrences
217 	 of the period.  */
218       size_t memory = 0;
219       j = 0;
220       while (AVAILABLE (haystack, haystack_len, j, needle_len))
221 	{
222 	  /* Scan for matches in right half.  */
223 	  i = MAX (suffix, memory);
224 	  while (i < needle_len && (CANON_ELEMENT (needle[i])
225 				    == CANON_ELEMENT (haystack[i + j])))
226 	    ++i;
227 	  if (needle_len <= i)
228 	    {
229 	      /* Scan for matches in left half.  */
230 	      i = suffix - 1;
231 	      while (memory < i + 1 && (CANON_ELEMENT (needle[i])
232 					== CANON_ELEMENT (haystack[i + j])))
233 		--i;
234 	      if (i + 1 < memory + 1)
235 		return (RETURN_TYPE) (haystack + j);
236 	      /* No match, so remember how many repetitions of period
237 		 on the right half were scanned.  */
238 	      j += period;
239 	      memory = needle_len - period;
240 	    }
241 	  else
242 	    {
243 	      j += i - suffix + 1;
244 	      memory = 0;
245 	    }
246 	}
247     }
248   else
249     {
250       /* The two halves of needle are distinct; no extra memory is
251 	 required, and any mismatch results in a maximal shift.  */
252       period = MAX (suffix, needle_len - suffix) + 1;
253       j = 0;
254       while (AVAILABLE (haystack, haystack_len, j, needle_len))
255 	{
256 	  /* Scan for matches in right half.  */
257 	  i = suffix;
258 	  while (i < needle_len && (CANON_ELEMENT (needle[i])
259 				    == CANON_ELEMENT (haystack[i + j])))
260 	    ++i;
261 	  if (needle_len <= i)
262 	    {
263 	      /* Scan for matches in left half.  */
264 	      i = suffix - 1;
265 	      while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
266 				       == CANON_ELEMENT (haystack[i + j])))
267 		--i;
268 	      if (i == SIZE_MAX)
269 		return (RETURN_TYPE) (haystack + j);
270 	      j += period;
271 	    }
272 	  else
273 	    j += i - suffix + 1;
274 	}
275     }
276   return NULL;
277 }
278 
279 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
280    NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
281    method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
282    Performance is guaranteed to be linear, with an initialization cost
283    of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
284 
285    If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
286    most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
287    and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
288    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
289    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
290    sublinear performance is not possible.  */
291 _NOINLINE_STATIC RETURN_TYPE __attribute__ ((__used__))
two_way_long_needle(const unsigned char * haystack,size_t haystack_len,const unsigned char * needle,size_t needle_len)292 two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
293 		     const unsigned char *needle, size_t needle_len)
294 {
295   size_t i; /* Index into current byte of NEEDLE.  */
296   size_t j; /* Index into current window of HAYSTACK.  */
297   size_t period; /* The period of the right half of needle.  */
298   size_t suffix; /* The index of the right half of needle.  */
299   size_t shift_table[1U << CHAR_BIT]; /* See below.  */
300 
301   /* Factor the needle into two halves, such that the left half is
302      smaller than the global period, and the right half is
303      periodic (with a period as large as NEEDLE_LEN - suffix).  */
304   suffix = critical_factorization (needle, needle_len, &period);
305 
306   /* Populate shift_table.  For each possible byte value c,
307      shift_table[c] is the distance from the last occurrence of c to
308      the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
309      shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0.  */
310   for (i = 0; i < 1U << CHAR_BIT; i++)
311     shift_table[i] = needle_len;
312   for (i = 0; i < needle_len; i++)
313     shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
314 
315   /* Perform the search.  Each iteration compares the right half
316      first.  */
317   if (CMP_FUNC (needle, needle + period, suffix) == 0)
318     {
319       /* Entire needle is periodic; a mismatch can only advance by the
320 	 period, so use memory to avoid rescanning known occurrences
321 	 of the period.  */
322       size_t memory = 0;
323       size_t shift;
324       j = 0;
325       while (AVAILABLE (haystack, haystack_len, j, needle_len))
326 	{
327 	  /* Check the last byte first; if it does not match, then
328 	     shift to the next possible match location.  */
329 	  shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
330 	  if (0 < shift)
331 	    {
332 	      if (memory && shift < period)
333 		{
334 		  /* Since needle is periodic, but the last period has
335 		     a byte out of place, there can be no match until
336 		     after the mismatch.  */
337 		  shift = needle_len - period;
338 		}
339 	      memory = 0;
340 	      j += shift;
341 	      continue;
342 	    }
343 	  /* Scan for matches in right half.  The last byte has
344 	     already been matched, by virtue of the shift table.  */
345 	  i = MAX (suffix, memory);
346 	  while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
347 					== CANON_ELEMENT (haystack[i + j])))
348 	    ++i;
349 	  if (needle_len - 1 <= i)
350 	    {
351 	      /* Scan for matches in left half.  */
352 	      i = suffix - 1;
353 	      while (memory < i + 1 && (CANON_ELEMENT (needle[i])
354 					== CANON_ELEMENT (haystack[i + j])))
355 		--i;
356 	      if (i + 1 < memory + 1)
357 		return (RETURN_TYPE) (haystack + j);
358 	      /* No match, so remember how many repetitions of period
359 		 on the right half were scanned.  */
360 	      j += period;
361 	      memory = needle_len - period;
362 	    }
363 	  else
364 	    {
365 	      j += i - suffix + 1;
366 	      memory = 0;
367 	    }
368 	}
369     }
370   else
371     {
372       /* The two halves of needle are distinct; no extra memory is
373 	 required, and any mismatch results in a maximal shift.  */
374       size_t shift;
375       period = MAX (suffix, needle_len - suffix) + 1;
376       j = 0;
377       while (AVAILABLE (haystack, haystack_len, j, needle_len))
378 	{
379 	  /* Check the last byte first; if it does not match, then
380 	     shift to the next possible match location.  */
381 	  shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
382 	  if (0 < shift)
383 	    {
384 	      j += shift;
385 	      continue;
386 	    }
387 	  /* Scan for matches in right half.  The last byte has
388 	     already been matched, by virtue of the shift table.  */
389 	  i = suffix;
390 	  while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
391 					== CANON_ELEMENT (haystack[i + j])))
392 	    ++i;
393 	  if (needle_len - 1 <= i)
394 	    {
395 	      /* Scan for matches in left half.  */
396 	      i = suffix - 1;
397 	      while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
398 				       == CANON_ELEMENT (haystack[i + j])))
399 		--i;
400 	      if (i == SIZE_MAX)
401 		return (RETURN_TYPE) (haystack + j);
402 	      j += period;
403 	    }
404 	  else
405 	    j += i - suffix + 1;
406 	}
407     }
408   return NULL;
409 }
410 
411 #undef AVAILABLE
412 #undef CANON_ELEMENT
413 #undef CMP_FUNC
414 #undef MAX
415 #undef RETURN_TYPE
416