268 lines
		
	
	
		
			9.3 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
		
		
			
		
	
	
			268 lines
		
	
	
		
			9.3 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
|  | 'use strict'; | ||
|  | /* | ||
|  |   Based heavily on the Streaming Boyer-Moore-Horspool C++ implementation | ||
|  |   by Hongli Lai at: https://github.com/FooBarWidget/boyer-moore-horspool
 | ||
|  | */ | ||
|  | function memcmp(buf1, pos1, buf2, pos2, num) { | ||
|  |   for (let i = 0; i < num; ++i) { | ||
|  |     if (buf1[pos1 + i] !== buf2[pos2 + i]) | ||
|  |       return false; | ||
|  |   } | ||
|  |   return true; | ||
|  | } | ||
|  | 
 | ||
|  | class SBMH { | ||
|  |   constructor(needle, cb) { | ||
|  |     if (typeof cb !== 'function') | ||
|  |       throw new Error('Missing match callback'); | ||
|  | 
 | ||
|  |     if (typeof needle === 'string') | ||
|  |       needle = Buffer.from(needle); | ||
|  |     else if (!Buffer.isBuffer(needle)) | ||
|  |       throw new Error(`Expected Buffer for needle, got ${typeof needle}`); | ||
|  | 
 | ||
|  |     const needleLen = needle.length; | ||
|  | 
 | ||
|  |     this.maxMatches = Infinity; | ||
|  |     this.matches = 0; | ||
|  | 
 | ||
|  |     this._cb = cb; | ||
|  |     this._lookbehindSize = 0; | ||
|  |     this._needle = needle; | ||
|  |     this._bufPos = 0; | ||
|  | 
 | ||
|  |     this._lookbehind = Buffer.allocUnsafe(needleLen); | ||
|  | 
 | ||
|  |     // Initialize occurrence table.
 | ||
|  |     this._occ = [ | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, | ||
|  |       needleLen, needleLen, needleLen, needleLen | ||
|  |     ]; | ||
|  | 
 | ||
|  |     // Populate occurrence table with analysis of the needle, ignoring the last
 | ||
|  |     // letter.
 | ||
|  |     if (needleLen > 1) { | ||
|  |       for (let i = 0; i < needleLen - 1; ++i) | ||
|  |         this._occ[needle[i]] = needleLen - 1 - i; | ||
|  |     } | ||
|  |   } | ||
|  | 
 | ||
|  |   reset() { | ||
|  |     this.matches = 0; | ||
|  |     this._lookbehindSize = 0; | ||
|  |     this._bufPos = 0; | ||
|  |   } | ||
|  | 
 | ||
|  |   push(chunk, pos) { | ||
|  |     let result; | ||
|  |     if (!Buffer.isBuffer(chunk)) | ||
|  |       chunk = Buffer.from(chunk, 'latin1'); | ||
|  |     const chunkLen = chunk.length; | ||
|  |     this._bufPos = pos || 0; | ||
|  |     while (result !== chunkLen && this.matches < this.maxMatches) | ||
|  |       result = feed(this, chunk); | ||
|  |     return result; | ||
|  |   } | ||
|  | 
 | ||
|  |   destroy() { | ||
|  |     const lbSize = this._lookbehindSize; | ||
|  |     if (lbSize) | ||
|  |       this._cb(false, this._lookbehind, 0, lbSize, false); | ||
|  |     this.reset(); | ||
|  |   } | ||
|  | } | ||
|  | 
 | ||
|  | function feed(self, data) { | ||
|  |   const len = data.length; | ||
|  |   const needle = self._needle; | ||
|  |   const needleLen = needle.length; | ||
|  | 
 | ||
|  |   // Positive: points to a position in `data`
 | ||
|  |   //           pos == 3 points to data[3]
 | ||
|  |   // Negative: points to a position in the lookbehind buffer
 | ||
|  |   //           pos == -2 points to lookbehind[lookbehindSize - 2]
 | ||
|  |   let pos = -self._lookbehindSize; | ||
|  |   const lastNeedleCharPos = needleLen - 1; | ||
|  |   const lastNeedleChar = needle[lastNeedleCharPos]; | ||
|  |   const end = len - needleLen; | ||
|  |   const occ = self._occ; | ||
|  |   const lookbehind = self._lookbehind; | ||
|  | 
 | ||
|  |   if (pos < 0) { | ||
|  |     // Lookbehind buffer is not empty. Perform Boyer-Moore-Horspool
 | ||
|  |     // search with character lookup code that considers both the
 | ||
|  |     // lookbehind buffer and the current round's haystack data.
 | ||
|  |     //
 | ||
|  |     // Loop until
 | ||
|  |     //   there is a match.
 | ||
|  |     // or until
 | ||
|  |     //   we've moved past the position that requires the
 | ||
|  |     //   lookbehind buffer. In this case we switch to the
 | ||
|  |     //   optimized loop.
 | ||
|  |     // or until
 | ||
|  |     //   the character to look at lies outside the haystack.
 | ||
|  |     while (pos < 0 && pos <= end) { | ||
|  |       const nextPos = pos + lastNeedleCharPos; | ||
|  |       const ch = (nextPos < 0 | ||
|  |                   ? lookbehind[self._lookbehindSize + nextPos] | ||
|  |                   : data[nextPos]); | ||
|  | 
 | ||
|  |       if (ch === lastNeedleChar | ||
|  |           && matchNeedle(self, data, pos, lastNeedleCharPos)) { | ||
|  |         self._lookbehindSize = 0; | ||
|  |         ++self.matches; | ||
|  |         if (pos > -self._lookbehindSize) | ||
|  |           self._cb(true, lookbehind, 0, self._lookbehindSize + pos, false); | ||
|  |         else | ||
|  |           self._cb(true, undefined, 0, 0, true); | ||
|  | 
 | ||
|  |         return (self._bufPos = pos + needleLen); | ||
|  |       } | ||
|  | 
 | ||
|  |       pos += occ[ch]; | ||
|  |     } | ||
|  | 
 | ||
|  |     // No match.
 | ||
|  | 
 | ||
|  |     // There's too few data for Boyer-Moore-Horspool to run,
 | ||
|  |     // so let's use a different algorithm to skip as much as
 | ||
|  |     // we can.
 | ||
|  |     // Forward pos until
 | ||
|  |     //   the trailing part of lookbehind + data
 | ||
|  |     //   looks like the beginning of the needle
 | ||
|  |     // or until
 | ||
|  |     //   pos == 0
 | ||
|  |     while (pos < 0 && !matchNeedle(self, data, pos, len - pos)) | ||
|  |       ++pos; | ||
|  | 
 | ||
|  |     if (pos < 0) { | ||
|  |       // Cut off part of the lookbehind buffer that has
 | ||
|  |       // been processed and append the entire haystack
 | ||
|  |       // into it.
 | ||
|  |       const bytesToCutOff = self._lookbehindSize + pos; | ||
|  | 
 | ||
|  |       if (bytesToCutOff > 0) { | ||
|  |         // The cut off data is guaranteed not to contain the needle.
 | ||
|  |         self._cb(false, lookbehind, 0, bytesToCutOff, false); | ||
|  |       } | ||
|  | 
 | ||
|  |       self._lookbehindSize -= bytesToCutOff; | ||
|  |       lookbehind.copy(lookbehind, 0, bytesToCutOff, self._lookbehindSize); | ||
|  |       lookbehind.set(data, self._lookbehindSize); | ||
|  |       self._lookbehindSize += len; | ||
|  | 
 | ||
|  |       self._bufPos = len; | ||
|  |       return len; | ||
|  |     } | ||
|  | 
 | ||
|  |     // Discard lookbehind buffer.
 | ||
|  |     self._cb(false, lookbehind, 0, self._lookbehindSize, false); | ||
|  |     self._lookbehindSize = 0; | ||
|  |   } | ||
|  | 
 | ||
|  |   pos += self._bufPos; | ||
|  | 
 | ||
|  |   const firstNeedleChar = needle[0]; | ||
|  | 
 | ||
|  |   // Lookbehind buffer is now empty. Perform Boyer-Moore-Horspool
 | ||
|  |   // search with optimized character lookup code that only considers
 | ||
|  |   // the current round's haystack data.
 | ||
|  |   while (pos <= end) { | ||
|  |     const ch = data[pos + lastNeedleCharPos]; | ||
|  | 
 | ||
|  |     if (ch === lastNeedleChar | ||
|  |         && data[pos] === firstNeedleChar | ||
|  |         && memcmp(needle, 0, data, pos, lastNeedleCharPos)) { | ||
|  |       ++self.matches; | ||
|  |       if (pos > 0) | ||
|  |         self._cb(true, data, self._bufPos, pos, true); | ||
|  |       else | ||
|  |         self._cb(true, undefined, 0, 0, true); | ||
|  | 
 | ||
|  |       return (self._bufPos = pos + needleLen); | ||
|  |     } | ||
|  | 
 | ||
|  |     pos += occ[ch]; | ||
|  |   } | ||
|  | 
 | ||
|  |   // There was no match. If there's trailing haystack data that we cannot
 | ||
|  |   // match yet using the Boyer-Moore-Horspool algorithm (because the trailing
 | ||
|  |   // data is less than the needle size) then match using a modified
 | ||
|  |   // algorithm that starts matching from the beginning instead of the end.
 | ||
|  |   // Whatever trailing data is left after running this algorithm is added to
 | ||
|  |   // the lookbehind buffer.
 | ||
|  |   while (pos < len) { | ||
|  |     if (data[pos] !== firstNeedleChar | ||
|  |         || !memcmp(data, pos, needle, 0, len - pos)) { | ||
|  |       ++pos; | ||
|  |       continue; | ||
|  |     } | ||
|  |     data.copy(lookbehind, 0, pos, len); | ||
|  |     self._lookbehindSize = len - pos; | ||
|  |     break; | ||
|  |   } | ||
|  | 
 | ||
|  |   // Everything until `pos` is guaranteed not to contain needle data.
 | ||
|  |   if (pos > 0) | ||
|  |     self._cb(false, data, self._bufPos, pos < len ? pos : len, true); | ||
|  | 
 | ||
|  |   self._bufPos = len; | ||
|  |   return len; | ||
|  | } | ||
|  | 
 | ||
|  | function matchNeedle(self, data, pos, len) { | ||
|  |   const lb = self._lookbehind; | ||
|  |   const lbSize = self._lookbehindSize; | ||
|  |   const needle = self._needle; | ||
|  | 
 | ||
|  |   for (let i = 0; i < len; ++i, ++pos) { | ||
|  |     const ch = (pos < 0 ? lb[lbSize + pos] : data[pos]); | ||
|  |     if (ch !== needle[i]) | ||
|  |       return false; | ||
|  |   } | ||
|  |   return true; | ||
|  | } | ||
|  | 
 | ||
|  | module.exports = SBMH; |