1 /* Byte-wise substring search, using the Two-Way algorithm.
2  * Copyright (C) 2008, 2010 Eric Blake
3  * Permission to use, copy, modify, and distribute this software
4  * is freely granted, provided that this notice is preserved.
5  */
6 
7 
8 /* Before including this file, you need to include <string.h>, and define:
9      RESULT_TYPE		A macro that expands to the return type.
10      AVAILABLE(h, h_l, j, n_l)	A macro that returns nonzero if there are
11 				at least N_L bytes left starting at
12 				H[J].  H is 'unsigned char *', H_L, J,
13 				and N_L are 'size_t'; H_L is an
14 				lvalue.  For NUL-terminated searches,
15 				H_L can be modified each iteration to
16 				avoid having to compute the end of H
17 				up front.
18 
19   For case-insensitivity, you may optionally define:
20      CMP_FUNC(p1, p2, l)	A macro that returns 0 iff the first L
21 				characters of P1 and P2 are equal.
22      CANON_ELEMENT(c)		A macro that canonicalizes an element
23 				right after it has been fetched from
24 				one of the two strings.  The argument
25 				is an 'unsigned char'; the result must
26 				be an 'unsigned char' as well.
27 
28   This file undefines the macros documented above, and defines
29   LONG_NEEDLE_THRESHOLD.
30 */
31 
32 #include <limits.h>
33 #include <stdint.h>
34 #include <_ansi.h>
35 
36 /* We use the Two-Way string matching algorithm, which guarantees
37    linear complexity with constant space.  Additionally, for long
38    needles, we also use a bad character shift table similar to the
39    Boyer-Moore algorithm to achieve improved (potentially sub-linear)
40    performance.
41 
42    See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
43    and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
44 */
45 
46 /* Point at which computing a bad-byte shift table is likely to be
47    worthwhile.  Small needles should not compute a table, since it
48    adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
49    speedup no greater than a factor of NEEDLE_LEN.  The larger the
50    needle, the better the potential performance gain.  On the other
51    hand, on non-POSIX systems with CHAR_BIT larger than eight, the
52    memory required for the table is prohibitive.  */
53 #if CHAR_BIT < 10
54 # define LONG_NEEDLE_THRESHOLD 32U
55 #else
56 # define LONG_NEEDLE_THRESHOLD SIZE_MAX
57 #endif
58 
59 #define MAX(a, b) ((a < b) ? (b) : (a))
60 
61 #ifndef CANON_ELEMENT
62 # define CANON_ELEMENT(c) c
63 #endif
64 #ifndef CMP_FUNC
65 # define CMP_FUNC memcmp
66 #endif
67 
68 /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
69    Return the index of the first byte in the right half, and set
70    *PERIOD to the global period of the right half.
71 
72    The global period of a string is the smallest index (possibly its
73    length) at which all remaining bytes in the string are repetitions
74    of the prefix (the last repetition may be a subset of the prefix).
75 
76    When NEEDLE is factored into two halves, a local period is the
77    length of the smallest word that shares a suffix with the left half
78    and shares a prefix with the right half.  All factorizations of a
79    non-empty NEEDLE have a local period of at least 1 and no greater
80    than NEEDLE_LEN.
81 
82    A critical factorization has the property that the local period
83    equals the global period.  All strings have at least one critical
84    factorization with the left half smaller than the global period.
85 
86    Given an ordered alphabet, a critical factorization can be computed
87    in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
88    larger of two ordered maximal suffixes.  The ordered maximal
89    suffixes are determined by lexicographic comparison of
90    periodicity.  */
91 static size_t
critical_factorization(const unsigned char * needle,size_t needle_len,size_t * period)92 critical_factorization (const unsigned char *needle, size_t needle_len,
93 			size_t *period)
94 {
95   /* Index of last byte of left half, or SIZE_MAX.  */
96   size_t max_suffix, max_suffix_rev;
97   size_t j; /* Index into NEEDLE for current candidate suffix.  */
98   size_t k; /* Offset into current period.  */
99   size_t p; /* Intermediate period.  */
100   unsigned char a, b; /* Current comparison bytes.  */
101 
102   /* Invariants:
103      0 <= j < NEEDLE_LEN - 1
104      -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
105      min(max_suffix, max_suffix_rev) < global period of NEEDLE
106      1 <= p <= global period of NEEDLE
107      p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
108      1 <= k <= p
109   */
110 
111   /* Perform lexicographic search.  */
112   max_suffix = SIZE_MAX;
113   j = 0;
114   k = p = 1;
115   while (j + k < needle_len)
116     {
117       a = CANON_ELEMENT (needle[j + k]);
118       b = CANON_ELEMENT (needle[(size_t)(max_suffix + k)]);
119       if (a < b)
120 	{
121 	  /* Suffix is smaller, period is entire prefix so far.  */
122 	  j += k;
123 	  k = 1;
124 	  p = j - max_suffix;
125 	}
126       else if (a == b)
127 	{
128 	  /* Advance through repetition of the current period.  */
129 	  if (k != p)
130 	    ++k;
131 	  else
132 	    {
133 	      j += p;
134 	      k = 1;
135 	    }
136 	}
137       else /* b < a */
138 	{
139 	  /* Suffix is larger, start over from current location.  */
140 	  max_suffix = j++;
141 	  k = p = 1;
142 	}
143     }
144   *period = p;
145 
146   /* Perform reverse lexicographic search.  */
147   max_suffix_rev = SIZE_MAX;
148   j = 0;
149   k = p = 1;
150   while (j + k < needle_len)
151     {
152       a = CANON_ELEMENT (needle[j + k]);
153       b = CANON_ELEMENT (needle[max_suffix_rev + k]);
154       if (b < a)
155 	{
156 	  /* Suffix is smaller, period is entire prefix so far.  */
157 	  j += k;
158 	  k = 1;
159 	  p = j - max_suffix_rev;
160 	}
161       else if (a == b)
162 	{
163 	  /* Advance through repetition of the current period.  */
164 	  if (k != p)
165 	    ++k;
166 	  else
167 	    {
168 	      j += p;
169 	      k = 1;
170 	    }
171 	}
172       else /* a < b */
173 	{
174 	  /* Suffix is larger, start over from current location.  */
175 	  max_suffix_rev = j++;
176 	  k = p = 1;
177 	}
178     }
179 
180   /* Choose the longer suffix.  Return the first byte of the right
181      half, rather than the last byte of the left half.  */
182   if (max_suffix_rev + 1 < max_suffix + 1)
183     return max_suffix + 1;
184   *period = p;
185   return max_suffix_rev + 1;
186 }
187 
188 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
189    NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
190    method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
191    Performance is guaranteed to be linear, with an initialization cost
192    of 2 * NEEDLE_LEN comparisons.
193 
194    If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
195    most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
196    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
197    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.  */
198 static inline RETURN_TYPE
two_way_short_needle(const unsigned char * haystack,size_t haystack_len,const unsigned char * needle,size_t needle_len)199 two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
200 		      const unsigned char *needle, size_t needle_len)
201 {
202   size_t i; /* Index into current byte of NEEDLE.  */
203   size_t j; /* Index into current window of HAYSTACK.  */
204   size_t period; /* The period of the right half of needle.  */
205   size_t suffix; /* The index of the right half of needle.  */
206 
207   /* Factor the needle into two halves, such that the left half is
208      smaller than the global period, and the right half is
209      periodic (with a period as large as NEEDLE_LEN - suffix).  */
210   suffix = critical_factorization (needle, needle_len, &period);
211 
212   /* Perform the search.  Each iteration compares the right half
213      first.  */
214   if (CMP_FUNC (needle, needle + period, suffix) == 0)
215     {
216       /* Entire needle is periodic; a mismatch can only advance by the
217 	 period, so use memory to avoid rescanning known occurrences
218 	 of the period.  */
219       size_t memory = 0;
220       j = 0;
221       while (AVAILABLE (haystack, haystack_len, j, needle_len))
222 	{
223 	  /* Scan for matches in right half.  */
224 	  i = MAX (suffix, memory);
225 	  while (i < needle_len && (CANON_ELEMENT (needle[i])
226 				    == CANON_ELEMENT (haystack[i + j])))
227 	    ++i;
228 	  if (needle_len <= i)
229 	    {
230 	      /* Scan for matches in left half.  */
231 	      i = suffix - 1;
232 	      while (memory < i + 1 && (CANON_ELEMENT (needle[i])
233 					== CANON_ELEMENT (haystack[i + j])))
234 		--i;
235 	      if (i + 1 < memory + 1)
236 		return (RETURN_TYPE) (haystack + j);
237 	      /* No match, so remember how many repetitions of period
238 		 on the right half were scanned.  */
239 	      j += period;
240 	      memory = needle_len - period;
241 	    }
242 	  else
243 	    {
244 	      j += i - suffix + 1;
245 	      memory = 0;
246 	    }
247 	}
248     }
249   else
250     {
251       /* The two halves of needle are distinct; no extra memory is
252 	 required, and any mismatch results in a maximal shift.  */
253       period = MAX (suffix, needle_len - suffix) + 1;
254       j = 0;
255       while (AVAILABLE (haystack, haystack_len, j, needle_len))
256 	{
257 	  /* Scan for matches in right half.  */
258 	  i = suffix;
259 	  while (i < needle_len && (CANON_ELEMENT (needle[i])
260 				    == CANON_ELEMENT (haystack[i + j])))
261 	    ++i;
262 	  if (needle_len <= i)
263 	    {
264 	      /* Scan for matches in left half.  */
265 	      i = suffix - 1;
266 	      while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
267 				       == CANON_ELEMENT (haystack[i + j])))
268 		--i;
269 	      if (i == SIZE_MAX)
270 		return (RETURN_TYPE) (haystack + j);
271 	      j += period;
272 	    }
273 	  else
274 	    j += i - suffix + 1;
275 	}
276     }
277   return NULL;
278 }
279 
280 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
281    NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
282    method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
283    Performance is guaranteed to be linear, with an initialization cost
284    of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
285 
286    If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
287    most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
288    and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
289    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
290    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
291    sublinear performance is not possible.  */
292 _NOINLINE_STATIC RETURN_TYPE __attribute__ ((__used__))
two_way_long_needle(const unsigned char * haystack,size_t haystack_len,const unsigned char * needle,size_t needle_len)293 two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
294 		     const unsigned char *needle, size_t needle_len)
295 {
296   size_t i; /* Index into current byte of NEEDLE.  */
297   size_t j; /* Index into current window of HAYSTACK.  */
298   size_t period; /* The period of the right half of needle.  */
299   size_t suffix; /* The index of the right half of needle.  */
300   size_t shift_table[1U << CHAR_BIT]; /* See below.  */
301 
302   /* Factor the needle into two halves, such that the left half is
303      smaller than the global period, and the right half is
304      periodic (with a period as large as NEEDLE_LEN - suffix).  */
305   suffix = critical_factorization (needle, needle_len, &period);
306 
307   /* Populate shift_table.  For each possible byte value c,
308      shift_table[c] is the distance from the last occurrence of c to
309      the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
310      shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0.  */
311   for (i = 0; i < 1U << CHAR_BIT; i++)
312     shift_table[i] = needle_len;
313   for (i = 0; i < needle_len; i++)
314     shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
315 
316   /* Perform the search.  Each iteration compares the right half
317      first.  */
318   if (CMP_FUNC (needle, needle + period, suffix) == 0)
319     {
320       /* Entire needle is periodic; a mismatch can only advance by the
321 	 period, so use memory to avoid rescanning known occurrences
322 	 of the period.  */
323       size_t memory = 0;
324       size_t shift;
325       j = 0;
326       while (AVAILABLE (haystack, haystack_len, j, needle_len))
327 	{
328 	  /* Check the last byte first; if it does not match, then
329 	     shift to the next possible match location.  */
330 	  shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
331 	  if (0 < shift)
332 	    {
333 	      if (memory && shift < period)
334 		{
335 		  /* Since needle is periodic, but the last period has
336 		     a byte out of place, there can be no match until
337 		     after the mismatch.  */
338 		  shift = needle_len - period;
339 		}
340 	      memory = 0;
341 	      j += shift;
342 	      continue;
343 	    }
344 	  /* Scan for matches in right half.  The last byte has
345 	     already been matched, by virtue of the shift table.  */
346 	  i = MAX (suffix, memory);
347 	  while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
348 					== CANON_ELEMENT (haystack[i + j])))
349 	    ++i;
350 	  if (needle_len - 1 <= i)
351 	    {
352 	      /* Scan for matches in left half.  */
353 	      i = suffix - 1;
354 	      while (memory < i + 1 && (CANON_ELEMENT (needle[i])
355 					== CANON_ELEMENT (haystack[i + j])))
356 		--i;
357 	      if (i + 1 < memory + 1)
358 		return (RETURN_TYPE) (haystack + j);
359 	      /* No match, so remember how many repetitions of period
360 		 on the right half were scanned.  */
361 	      j += period;
362 	      memory = needle_len - period;
363 	    }
364 	  else
365 	    {
366 	      j += i - suffix + 1;
367 	      memory = 0;
368 	    }
369 	}
370     }
371   else
372     {
373       /* The two halves of needle are distinct; no extra memory is
374 	 required, and any mismatch results in a maximal shift.  */
375       size_t shift;
376       period = MAX (suffix, needle_len - suffix) + 1;
377       j = 0;
378       while (AVAILABLE (haystack, haystack_len, j, needle_len))
379 	{
380 	  /* Check the last byte first; if it does not match, then
381 	     shift to the next possible match location.  */
382 	  shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
383 	  if (0 < shift)
384 	    {
385 	      j += shift;
386 	      continue;
387 	    }
388 	  /* Scan for matches in right half.  The last byte has
389 	     already been matched, by virtue of the shift table.  */
390 	  i = suffix;
391 	  while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
392 					== CANON_ELEMENT (haystack[i + j])))
393 	    ++i;
394 	  if (needle_len - 1 <= i)
395 	    {
396 	      /* Scan for matches in left half.  */
397 	      i = suffix - 1;
398 	      while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
399 				       == CANON_ELEMENT (haystack[i + j])))
400 		--i;
401 	      if (i == SIZE_MAX)
402 		return (RETURN_TYPE) (haystack + j);
403 	      j += period;
404 	    }
405 	  else
406 	    j += i - suffix + 1;
407 	}
408     }
409   return NULL;
410 }
411 
412 #undef AVAILABLE
413 #undef CANON_ELEMENT
414 #undef CMP_FUNC
415 #undef MAX
416 #undef RETURN_TYPE
417