1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
//! This module contains data structures for representing syntax definitions.
//! Everything is public because I want this library to be useful in super
//! integrated cases like text editors and I have no idea what kind of monkeying
//! you might want to do with the data. Perhaps parsing your own syntax format
//! into this data structure?
use std::collections::{BTreeMap, HashMap};
use std::hash::Hash;
use onig::{self, Regex, Region, Syntax};
use std::rc::{Rc, Weak};
use std::cell::RefCell;
use super::scope::*;
use regex_syntax::escape;
use serde::{Deserialize, Deserializer, Serialize, Serializer};

pub type CaptureMapping = Vec<(usize, Vec<Scope>)>;
pub type ContextPtr = Rc<RefCell<Context>>;

/// The main data structure representing a syntax definition loaded from a
/// `.sublime-syntax` file. You'll probably only need these as references
/// to be passed around to parsing code.
///
/// Some useful public fields are the `name` field which is a human readable
/// name to display in syntax lists, and the `hidden` field which means hide
/// this syntax from any lists because it is for internal use.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct SyntaxDefinition {
    pub name: String,
    pub file_extensions: Vec<String>,
    pub scope: Scope,
    pub first_line_match: Option<String>,
    pub hidden: bool,
    /// Filled in at link time to avoid serializing it multiple times
    #[serde(skip_serializing, skip_deserializing)]
    pub prototype: Option<ContextPtr>,

    #[serde(serialize_with = "ordered_map")]
    pub variables: HashMap<String, String>,
    #[serde(serialize_with = "ordered_map")]
    pub contexts: HashMap<String, ContextPtr>,
}

#[derive(Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct Context {
    pub meta_scope: Vec<Scope>,
    pub meta_content_scope: Vec<Scope>,
    /// This being set false in the syntax file implies this field being set false,
    /// but it can also be set falso for contexts that don't include the prototype for other reasons
    pub meta_include_prototype: bool,
    pub clear_scopes: Option<ClearAmount>,
    /// This is filled in by the linker at link time
    /// for contexts that have `meta_include_prototype==true`
    /// and are not included from the prototype.
    #[serde(skip_serializing, skip_deserializing)]
    pub prototype: Option<ContextPtr>,
    pub uses_backrefs: bool,

    pub patterns: Vec<Pattern>,
}

#[derive(Debug, Eq, PartialEq, Serialize, Deserialize)]
pub enum Pattern {
    Match(MatchPattern),
    Include(ContextReference),
}

/// Used to iterate over all the match patterns in a context.
/// Basically walks the tree of patterns and include directives
/// in the correct order.
#[derive(Debug)]
pub struct MatchIter {
    ctx_stack: Vec<ContextPtr>,
    index_stack: Vec<usize>,
}

#[derive(Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct MatchPattern {
    pub has_captures: bool,
    pub regex_str: String,
    #[serde(skip_serializing, skip_deserializing)]
    pub regex: Option<Regex>,
    pub scope: Vec<Scope>,
    pub captures: Option<CaptureMapping>,
    pub operation: MatchOperation,
    pub with_prototype: Option<ContextPtr>,
}

/// This wrapper only exists so that I can implement a serialization
/// trait that crashes if you try and serialize this.
#[derive(Debug)]
pub struct LinkerLink {
    pub link: Weak<RefCell<Context>>,
}

#[derive(Debug, Eq, PartialEq, Serialize, Deserialize)]
pub enum ContextReference {
    Named(String),
    ByScope {
        scope: Scope,
        sub_context: Option<String>,
    },
    File {
        name: String,
        sub_context: Option<String>,
    },
    Inline(ContextPtr),
    Direct(LinkerLink),
}

#[derive(Debug, Eq, PartialEq, Serialize, Deserialize)]
pub enum MatchOperation {
    Push(Vec<ContextReference>),
    Set(Vec<ContextReference>),
    Pop,
    None,
}

impl Iterator for MatchIter {
    type Item = (ContextPtr, usize);

    fn next(&mut self) -> Option<(ContextPtr, usize)> {
        loop {
            if self.ctx_stack.is_empty() {
                return None;
            }
            // uncomment for debugging infinite recursion
            // println!("{:?}", self.index_stack);
            // use std::thread::sleep_ms;
            // sleep_ms(500);
            let last_index = self.ctx_stack.len() - 1;
            let context_ref = self.ctx_stack[last_index].clone();
            let context = context_ref.borrow();
            let index = self.index_stack[last_index];
            self.index_stack[last_index] = index + 1;
            if index < context.patterns.len() {
                match context.patterns[index] {
                    Pattern::Match(_) => return Some((context_ref.clone(), index)),
                    Pattern::Include(ref ctx_ref) => {
                        let ctx_ptr = match *ctx_ref {
                            ContextReference::Inline(ref ctx_ptr) => ctx_ptr.clone(),
                            ContextReference::Direct(ref ctx_ptr) => {
                                ctx_ptr.link.upgrade().unwrap()
                            }
                            _ => return self.next(), // skip this and move onto the next one
                        };
                        self.ctx_stack.push(ctx_ptr);
                        self.index_stack.push(0);
                    }
                }
            } else {
                self.ctx_stack.pop();
                self.index_stack.pop();
            }
        }
    }
}

/// Returns an iterator over all the match patterns in this context.
/// It recursively follows include directives. Can only be run on
/// contexts that have already been linked up.
pub fn context_iter(ctx: ContextPtr) -> MatchIter {
    MatchIter {
        ctx_stack: vec![ctx],
        index_stack: vec![0],
    }
}

impl Context {
    /// Returns the match pattern at an index, panics if the thing isn't a match pattern
    pub fn match_at(&self, index: usize) -> &MatchPattern {
        match self.patterns[index] {
            Pattern::Match(ref match_pat) => match_pat,
            _ => panic!("bad index to match_at"),
        }
    }

    /// Returns a mutable reference, otherwise like `match_at`
    pub fn match_at_mut(&mut self, index: usize) -> &mut MatchPattern {
        match self.patterns[index] {
            Pattern::Match(ref mut match_pat) => match_pat,
            _ => panic!("bad index to match_at"),
        }
    }
}

impl ContextReference {
    /// find the pointed to context, panics if ref is not linked
    pub fn resolve(&self) -> ContextPtr {
        match *self {
            ContextReference::Inline(ref ptr) => ptr.clone(),
            ContextReference::Direct(ref ptr) => ptr.link.upgrade().unwrap(),
            _ => panic!("Can only call resolve on linked references: {:?}", self),
        }
    }
}

impl MatchPattern {
    /// substitutes back-refs in Regex with regions from s
    /// used for match patterns which refer to captures from the pattern
    /// that pushed them.
    pub fn regex_with_substitutes(&self, region: &Region, s: &str) -> String {
        let mut reg_str = String::new();

        let mut last_was_escape = false;
        for c in self.regex_str.chars() {
            if last_was_escape && c.is_digit(10) {
                let val = c.to_digit(10).unwrap();
                if let Some((start, end)) = region.pos(val as usize) {
                    let escaped = escape(&s[start..end]);
                    reg_str.push_str(&escaped);
                }
            } else if last_was_escape {
                reg_str.push('\\');
                reg_str.push(c);
            } else if c != '\\' {
                reg_str.push(c);
            }

            last_was_escape = c == '\\' && !last_was_escape;
        }
        reg_str
    }

    /// Used by the parser to compile a regex which needs to reference
    /// regions from another matched pattern.
    pub fn compile_with_refs(&self, region: &Region, s: &str) -> Regex {
        // TODO don't panic on invalid regex
        Regex::with_options(&self.regex_with_substitutes(region, s),
                            onig::REGEX_OPTION_CAPTURE_GROUP,
                            Syntax::default())
            .unwrap()
    }

    fn compile_regex(&mut self) {
        // TODO don't panic on invalid regex
        let compiled = Regex::with_options(&self.regex_str,
                                           onig::REGEX_OPTION_CAPTURE_GROUP,
                                           Syntax::default())
            .unwrap();
        self.regex = Some(compiled);
    }

    /// Makes sure the regex is compiled if it doesn't have captures.
    /// May compile the regex if it isn't, panicing if compilation fails.
    #[inline]
    pub fn ensure_compiled_if_possible(&mut self) {
        if self.regex.is_none() && !self.has_captures {
            self.compile_regex();
        }
    }
}

impl Eq for LinkerLink {}

impl PartialEq for LinkerLink {
    fn eq(&self, other: &LinkerLink) -> bool {
        self.link.upgrade() == other.link.upgrade()
    }
}


/// Just panics, we can't do anything with linked up syntaxes
impl Serialize for LinkerLink {
    fn serialize<S>(&self, _: S) -> Result<S::Ok, S::Error> where S: Serializer {
        panic!("Can't serialize syntax definitions which have been linked");
    }
}

/// Just panics, we can't do anything with linked up syntaxes
impl<'de> Deserialize<'de> for LinkerLink {
    fn deserialize<D>(_: D) -> Result<Self, D::Error> where D: Deserializer<'de> {
        panic!("No linked syntax should ever have gotten serialized");
    }
}


/// Serialize the provided map in natural key order, so that it's deterministic when dumping.
fn ordered_map<K, V, S>(map: &HashMap<K, V>, serializer: S) -> Result<S::Ok, S::Error>
    where S: Serializer, K: Eq + Hash + Ord + Serialize, V: Serialize
{
    let ordered: BTreeMap<_, _> = map.iter().collect();
    ordered.serialize(serializer)
}


#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn can_compile_refs() {
        use onig::{self, Regex, Region};
        let pat = MatchPattern {
            has_captures: true,
            regex_str: String::from(r"lol \\ \2 \1 '\9' \wz"),
            regex: None,
            scope: vec![],
            captures: None,
            operation: MatchOperation::None,
            with_prototype: None,
        };
        let r = Regex::new(r"(\\\[\]\(\))(b)(c)(d)(e)").unwrap();
        let mut region = Region::new();
        let s = r"\[]()bcde";
        assert!(r.match_with_options(s, 0, onig::SEARCH_OPTION_NONE, Some(&mut region)).is_some());

        let regex_res = pat.regex_with_substitutes(&region, s);
        assert_eq!(regex_res, r"lol \\ b \\\[\]\(\) '' \wz");
        pat.compile_with_refs(&region, s);
    }
}