1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
use std::fmt;
use std::panic::{RefUnwindSafe, UnwindSafe};

use memchr::{memchr, memchr2, memchr3};

/// A prefilter describes the behavior of fast literal scanners for quickly
/// skipping past bytes in the haystack that we know cannot possibly
/// participate in a match.
pub trait Prefilter: Send + Sync + RefUnwindSafe + UnwindSafe + fmt::Debug {
    /// Returns the next possible match candidate. This may yield false
    /// positives, so callers must "confirm" a match starting at the position
    /// returned. This, however, must never produce false negatives. That is,
    /// this must, at minimum, return the starting position of the next match
    /// in the given haystack after or at the given position.
    fn next_candidate(&self, haystack: &[u8], at: usize) -> Option<usize>;

    /// A method for cloning a prefilter, to work-around the fact that Clone
    /// is not object-safe.
    fn clone_prefilter(&self) -> Box<Prefilter>;
}

/// A convenience object for representing any type that implements Prefilter
/// and is cloneable.
#[derive(Debug)]
pub struct PrefilterObj(Box<Prefilter>);

impl Clone for PrefilterObj {
    fn clone(&self) -> Self {
        PrefilterObj(self.0.clone_prefilter())
    }
}

impl PrefilterObj {
    /// Create a new prefilter object.
    pub fn new<T: Prefilter + 'static>(t: T) -> PrefilterObj {
        PrefilterObj(Box::new(t))
    }

    /// Return the underlying prefilter trait object.
    pub fn as_ref(&self) -> &Prefilter {
        &*self.0
    }
}

/// PrefilterState tracks state associated with the effectiveness of a
/// prefilter. It is used to track how many bytes, on average, are skipped by
/// the prefilter. If this average dips below a certain threshold over time,
/// then the state renders the prefilter inert and stops using it.
///
/// A prefilter state should be created for each search. (Where creating an
/// iterator via, e.g., `find_iter`, is treated as a single search.)
#[derive(Clone, Debug)]
pub struct PrefilterState {
    /// The number of skips that has been executed.
    skips: usize,
    /// The total number of bytes that have been skipped.
    skipped: usize,
    /// The maximum length of a match. This is used to help determine how many
    /// bytes on average should be skipped in order for a prefilter to be
    /// effective.
    max_match_len: usize,
    /// Once this heuristic has been deemed ineffective, it will be inert
    /// throughout the rest of its lifetime. This serves as a cheap way to
    /// check inertness.
    inert: bool,
}

impl PrefilterState {
    /// The minimum number of skip attempts to try before considering whether
    /// a prefilter is effective or not.
    const MIN_SKIPS: usize = 40;

    /// The minimum amount of bytes that skipping must average, expressed as
    /// a factor of the multiple of the length of a possible match.
    ///
    /// That is, after MIN_SKIPS have occurred, if the average number of bytes
    /// skipped ever falls below MIN_AVG_FACTOR, then this searcher is rendered
    /// inert.
    const MIN_AVG_FACTOR: usize = 2;

    /// Create a fresh prefilter state.
    pub fn new(max_match_len: usize) -> PrefilterState {
        PrefilterState { skips: 0, skipped: 0, max_match_len, inert: false }
    }

    /// Update this state with the number of bytes skipped on the last
    /// invocation of the prefilter.
    #[inline]
    pub fn update(&mut self, skipped: usize) {
        self.skips += 1;
        self.skipped += skipped;
    }

    /// Return true if and only if this state indicates that a prefilter is
    /// still effective.
    #[inline]
    pub fn is_effective(&mut self) -> bool {
        if self.inert {
            return false;
        }
        if self.skips < PrefilterState::MIN_SKIPS {
            return true;
        }

        let min_avg = PrefilterState::MIN_AVG_FACTOR * self.max_match_len;
        if self.skipped >= min_avg * self.skips {
            return true;
        }

        // We're inert.
        self.inert = true;
        false
    }
}

/// A builder for construction a starting byte prefilter.
///
/// A starting byte prefilter is a simplistic prefilter that looks for possible
/// matches by reporting all positions corresponding to a particular byte. This
/// generally only takes affect when there are at most 3 distinct possible
/// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two
/// distinct starting bytes (`f` and `b`), and this prefiler returns all
/// occurrences of either `f` or `b`.
///
/// In some cases, a heuristic frequency analysis may determine that it would
/// be better not to use this prefilter even when there are 3 or fewer distinct
/// starting bytes.
#[derive(Clone, Debug)]
pub struct StartBytesBuilder {
    byteset: Vec<bool>,
}

impl StartBytesBuilder {
    /// Create a new builder for constructing a start byte prefilter.
    pub fn new() -> StartBytesBuilder {
        StartBytesBuilder { byteset: vec![false; 256] }
    }

    /// Build the starting bytes prefilter.
    ///
    /// If there are more than 3 distinct starting bytes, or if heuristics
    /// otherwise determine that this prefilter should not be used, then `None`
    /// is returned.
    pub fn build(&self) -> Option<PrefilterObj> {
        let (mut bytes, mut len) = ([0; 3], 0);
        for b in 0..256 {
            if !self.byteset[b] {
                continue;
            }
            // We've exceeded our limit, so bail.
            if len == 3 {
                return None;
            }
            // We don't handle non-ASCII bytes for now. Getting non-ASCII
            // bytes right is trickier, since we generally don't want to put
            // a leading UTF-8 code unit into a prefilter that isn't ASCII,
            // since they can frequently. Instead, it would be better to use a
            // continuation byte, but this requires more sophisticated analysis
            // of the automaton and a richer prefilter API.
            if b > 0x7F {
                return None;
            }
            bytes[len] = b as u8;
            len += 1;
        }
        match len {
            0 => None,
            1 => {
                Some(PrefilterObj::new(StartBytesOne {
                    byte1: bytes[0],
                }))
            }
            2 => {
                Some(PrefilterObj::new(StartBytesTwo {
                    byte1: bytes[0],
                    byte2: bytes[1],
                }))
            }
            3 => {
                Some(PrefilterObj::new(StartBytesThree {
                    byte1: bytes[0],
                    byte2: bytes[1],
                    byte3: bytes[2],
                }))
            }
            _ => unreachable!(),
        }
    }

    /// Add a starting byte to this builder.
    ///
    /// In general, all possible starting bytes for an automaton should be
    /// added to this builder before attempting to construct the prefilter.
    pub fn add(&mut self, byte: u8) {
        self.byteset[byte as usize] = true;
    }
}

/// A prefilter for scanning for a single starting byte.
#[derive(Clone, Debug)]
pub struct StartBytesOne {
    byte1: u8,
}

impl Prefilter for StartBytesOne {
    fn next_candidate(&self, haystack: &[u8], at: usize) -> Option<usize> {
        memchr(self.byte1, &haystack[at..])
            .map(|i| at + i)
    }

    fn clone_prefilter(&self) -> Box<Prefilter> {
        Box::new(self.clone())
    }
}

/// A prefilter for scanning for two starting bytes.
#[derive(Clone, Debug)]
pub struct StartBytesTwo {
    byte1: u8,
    byte2: u8,
}

impl Prefilter for StartBytesTwo {
    fn next_candidate(&self, haystack: &[u8], at: usize) -> Option<usize> {
        memchr2(self.byte1, self.byte2, &haystack[at..])
            .map(|i| at + i)
    }

    fn clone_prefilter(&self) -> Box<Prefilter> {
        Box::new(self.clone())
    }
}

/// A prefilter for scanning for three starting bytes.
#[derive(Clone, Debug)]
pub struct StartBytesThree {
    byte1: u8,
    byte2: u8,
    byte3: u8,
}

impl Prefilter for StartBytesThree {
    fn next_candidate(&self, haystack: &[u8], at: usize) -> Option<usize> {
        memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..])
            .map(|i| at + i)
    }

    fn clone_prefilter(&self) -> Box<Prefilter> {
        Box::new(self.clone())
    }
}