1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
use super::syntax_definition::*;
use super::scope::*;
#[cfg(feature = "yaml-load")]
use super::super::LoadingError;

use std::path::Path;
#[cfg(feature = "yaml-load")]
use walkdir::WalkDir;
#[cfg(feature = "yaml-load")]
use std::io::Read;
use std::io::{self, BufRead, BufReader};
use std::fs::File;
use std::ops::DerefMut;
use std::mem;
use std::rc::Rc;
use std::ascii::AsciiExt;
use std::sync::Mutex;
use onig::Regex;

/// A syntax set holds a bunch of syntaxes and manages
/// loading them and the crucial operation of *linking*.
///
/// Linking replaces the references between syntaxes with direct
/// pointers. See `link_syntaxes` for more.
///
/// Re-linking (linking, adding more unlinked syntaxes with `load_syntaxes`,
/// and then linking again) is allowed.
#[derive(Debug, Serialize, Deserialize)]
pub struct SyntaxSet {
    /// Every syntax definition in the set, in the order it was added.
    /// The indices stored in `path_syntaxes` point into this vector.
    syntaxes: Vec<SyntaxDefinition>,
    /// Set to `false` by `load_syntaxes` and `add_syntax`, and back to `true`
    /// by `link_syntaxes`. A freshly created (empty) set starts out as `true`.
    pub is_linked: bool,
    /// Lazily built cache of compiled first-line regexes used by
    /// `find_syntax_by_first_line`. It is left out of (de)serialization.
    #[serde(skip_serializing, skip_deserializing)]
    first_line_cache: Mutex<FirstLineCache>,
    /// Stores the syntax index for every path that was loaded
    path_syntaxes: Vec<(String, usize)>,
}

/// Reads the file at `p` into memory and parses it as a syntax definition.
///
/// `lines_include_newline` is handed straight to `SyntaxDefinition::load_from_str`.
/// Failures while opening, reading or parsing are propagated as a `LoadingError`.
#[cfg(feature = "yaml-load")]
fn load_syntax_file(p: &Path,
                    lines_include_newline: bool)
                    -> Result<SyntaxDefinition, LoadingError> {
    let mut contents = String::new();
    File::open(p)?.read_to_string(&mut contents)?;

    let definition = SyntaxDefinition::load_from_str(&contents, lines_include_newline)?;
    Ok(definition)
}

impl Default for SyntaxSet {
    /// Builds an empty set: no syntaxes, no recorded paths and an empty
    /// first-line cache. The empty set is flagged as linked.
    fn default() -> Self {
        let first_line_cache = Mutex::new(FirstLineCache::new());
        SyntaxSet {
            syntaxes: vec![],
            is_linked: true,
            first_line_cache: first_line_cache,
            path_syntaxes: vec![],
        }
    }
}

impl SyntaxSet {
    pub fn new() -> SyntaxSet {
        SyntaxSet::default()
    }

    /// Convenience constructor: creates a new set, calls `load_syntaxes` on it for
    /// `folder` and then links everything with `link_syntaxes`.
    ///
    /// Syntaxes are loaded for lines given *without* trailing newline characters;
    /// see the `load_syntaxes` docs for why that may not be the best choice.
    /// Any loading failure is returned as a `LoadingError`.
    #[cfg(feature = "yaml-load")]
    pub fn load_from_folder<P: AsRef<Path>>(folder: P) -> Result<SyntaxSet, LoadingError> {
        let mut set = SyntaxSet::new();
        set.load_syntaxes(folder, false)?;
        set.link_syntaxes();
        Ok(set)
    }

    /// Walks `folder` recursively and loads every `.sublime-syntax` file found into this set.
    /// The set is marked as unlinked and is *not* re-linked, in case you want to serialize it.
    ///
    /// The `lines_include_newline` parameter is used to work around the fact that Sublime Text
    /// normally passes line strings including newline characters (`\n`) to its regex engine.
    /// This results in many syntaxes having regexes matching `\n`, which doesn't work if you
    /// don't pass in newlines. If you can, pass in lines with newlines and pass `true` for this
    /// parameter. If that is inconvenient pass `false` and the loader will do some hacky find
    /// and replaces on the match regexes that seem to work for the default syntax set, but may
    /// not work for any other syntaxes.
    ///
    /// In the future I might include a "slow mode" that copies the lines passed in and appends
    /// a newline if there isn't one, but in the interest of performance currently this hacky
    /// fix will have to do.
    ///
    /// Loading stops at the first directory-walk or file-loading error, which is returned.
    /// Syntaxes loaded before the failure stay in the set.
    #[cfg(feature = "yaml-load")]
    pub fn load_syntaxes<P: AsRef<Path>>(&mut self,
                                         folder: P,
                                         lines_include_newline: bool)
                                         -> Result<(), LoadingError> {
        self.is_linked = false;
        for entry in WalkDir::new(folder).sort_by(|a, b| a.cmp(b)) {
            let entry = entry.map_err(LoadingError::WalkDir)?;
            let is_syntax_file = entry.path()
                .extension()
                .map_or(false, |ext| ext == "sublime-syntax");
            if !is_syntax_file {
                continue;
            }

            let syntax = load_syntax_file(entry.path(), lines_include_newline)?;
            // The new syntax will live at the current end of the list.
            let index = self.syntaxes.len();
            // Paths that are not valid UTF-8 are still loaded, just not recorded for path lookup.
            if let Some(path_str) = entry.path().to_str() {
                self.path_syntaxes.push((path_str.to_string(), index));
            }
            self.syntaxes.push(syntax);
        }
        Ok(())
    }

    /// Appends `syntax` to the set and flags the set as unlinked.
    /// A previously linked set is then only partially linked, so call
    /// `link_syntaxes` again to get full linking.
    pub fn add_syntax(&mut self, syntax: SyntaxDefinition) {
        self.syntaxes.push(syntax);
        self.is_linked = false;
    }

    /// Borrows all syntaxes currently in the set, in insertion order.
    pub fn syntaxes(&self) -> &[SyntaxDefinition] {
        self.syntaxes.as_slice()
    }

    /// Rarely useful method that adds a "Plain Text" syntax (extension `txt`, scope
    /// `text.plain`) whose `main` context has no highlighting rules.
    /// Exists mainly for adding the plain text syntax to syntax set dumps, because for some
    /// reason the default Sublime plain text syntax is still in `.tmLanguage` format.
    ///
    /// Unlike `add_syntax`, this does not touch `is_linked`.
    #[cfg(feature = "yaml-load")]
    pub fn load_plain_text_syntax(&mut self) {
        let yaml = "---\nname: Plain Text\nfile_extensions: [txt]\nscope: text.plain\ncontexts: \
                    {main: []}";
        // The definition above is a fixed literal, so a parse failure here is a bug.
        let plain_text = SyntaxDefinition::load_from_str(yaml, false).unwrap();
        self.syntaxes.push(plain_text);
    }

    /// Returns the first syntax whose default scope equals `scope`; for example
    /// `source.regexp` finds the regex syntax.
    ///
    /// This and all similar methods below do a linear search of syntaxes. That should be
    /// fast because there aren't many syntaxes, but don't think you can call it a
    /// bajillion times per second.
    pub fn find_syntax_by_scope(&self, scope: Scope) -> Option<&SyntaxDefinition> {
        for syntax in &self.syntaxes {
            if syntax.scope == scope {
                return Some(syntax);
            }
        }
        None
    }

    /// Returns the first syntax whose name is exactly `name` (case-sensitive).
    pub fn find_syntax_by_name<'a>(&'a self, name: &str) -> Option<&'a SyntaxDefinition> {
        for syntax in &self.syntaxes {
            if syntax.name == name {
                return Some(syntax);
            }
        }
        None
    }

    /// Returns the first syntax that lists `extension` (compared exactly, without a
    /// leading dot being added or removed) among its file extensions.
    pub fn find_syntax_by_extension<'a>(&'a self, extension: &str) -> Option<&'a SyntaxDefinition> {
        for syntax in &self.syntaxes {
            let handles_extension = syntax.file_extensions.iter().any(|ext| ext == extension);
            if handles_extension {
                return Some(syntax);
            }
        }
        None
    }

    /// Searches for a syntax first by extension and then by ASCII-case-insensitive name.
    /// Useful for things like Github-flavoured-markdown code block highlighting where
    /// all you have to go on is a short token given by the user.
    ///
    /// Returns `None` when neither lookup finds a syntax.
    pub fn find_syntax_by_token<'a>(&'a self, s: &str) -> Option<&'a SyntaxDefinition> {
        self.find_syntax_by_extension(s).or_else(|| {
            // Compare in place instead of allocating a lowercased copy of every name.
            self.syntaxes.iter().find(|syntax| syntax.name.eq_ignore_ascii_case(s))
        })
    }

    /// Try to find the syntax for a file based on its first line.
    /// This uses regexes that come with some sublime syntax grammars
    /// for matching things like shebangs and mode lines like `-*- Mode: C -*-`
    pub fn find_syntax_by_first_line<'a>(&'a self, s: &str) -> Option<&'a SyntaxDefinition> {
        let mut cache = self.first_line_cache.lock().unwrap();
        cache.ensure_filled(self.syntaxes());
        for &(ref reg, i) in &cache.regexes {
            if reg.find(s).is_some() {
                return Some(&self.syntaxes[i]);
            }
        }
        None
    }

    /// Searches for a syntax by it's original file path when it was first loaded from disk
    /// primarily useful for syntax tests
    /// some may specify a Packages/PackageName/SyntaxName.sublime-syntax path
    /// others may just have SyntaxName.sublime-syntax
    /// this caters for these by matching the end of the path of the loaded syntax definition files
    // however, if a syntax name is provided without a folder, make sure we don't accidentally match the end of a different syntax definition's name - by checking a / comes before it or it is the full path
    pub fn find_syntax_by_path<'a>(&'a self, path: &str) -> Option<&'a SyntaxDefinition> {
        let mut slash_path = "/".to_string();
        slash_path.push_str(&path);
        return self.path_syntaxes.iter().find(|t| t.0.ends_with(&slash_path) || t.0 == path).map(|&(_,i)| &self.syntaxes[i]);
    }

    /// Convenience method that tries to find the syntax for a file path,
    /// first by extension and then, only if that finds nothing, by the first line of the file.
    /// May return an IO error because the fallback has to open the file and read its first line.
    /// A path without a (UTF-8) extension is looked up with the empty string as its extension.
    ///
    /// # Examples
    /// When determining how to highlight a file, use this in combination with a fallback to plain text:
    ///
    /// ```
    /// use syntect::parsing::SyntaxSet;
    /// let ss = SyntaxSet::load_defaults_nonewlines();
    /// let syntax = ss.find_syntax_for_file("testdata/highlight_test.erb")
    ///     .unwrap() // for IO errors, you may want to use try!() or another plain text fallback
    ///     .unwrap_or_else(|| ss.find_syntax_plain_text());
    /// assert_eq!(syntax.name, "HTML (Rails)");
    /// ```
    pub fn find_syntax_for_file<P: AsRef<Path>>(&self,
                                                path_obj: P)
                                                -> io::Result<Option<&SyntaxDefinition>> {
        let path: &Path = path_obj.as_ref();
        let extension = match path.extension().and_then(|ext| ext.to_str()) {
            Some(ext) => ext,
            None => "",
        };
        if let Some(syntax) = self.find_syntax_by_extension(extension) {
            return Ok(Some(syntax));
        }

        // No extension match: fall back to sniffing the first line of the file.
        let file = File::open(path)?;
        let mut first_line = String::new();
        BufReader::new(&file).read_line(&mut first_line)?;
        Ok(self.find_syntax_by_first_line(&first_line))
    }

    /// Looks up the syntax named "Plain Text", which usually has no highlighting rules.
    /// Good as a fallback when you can't find another syntax but you still want
    /// to use the same highlighting pipeline code.
    ///
    /// # Panics
    /// This syntax should always be present; if it is not, this method panics.
    /// If the way you load syntaxes doesn't create one, use `load_plain_text_syntax`.
    ///
    /// # Examples
    /// ```
    /// use syntect::parsing::SyntaxSet;
    /// let mut ss = SyntaxSet::new();
    /// ss.load_plain_text_syntax();
    /// let syntax = ss.find_syntax_by_token("rs").unwrap_or_else(|| ss.find_syntax_plain_text());
    /// assert_eq!(syntax.name, "Plain Text");
    /// ```
    pub fn find_syntax_plain_text(&self) -> &SyntaxDefinition {
        let plain_text = self.find_syntax_by_name("Plain Text");
        plain_text.expect("All syntax sets ought to have a plain text syntax")
    }

    /// This links all the syntaxes in this set directly with pointers for performance purposes.
    /// It is necessary to do this before parsing anything with these syntaxes.
    /// However, it is not possible to serialize a syntax set that has been linked,
    /// which is why it isn't done by default, except by the load_from_folder constructor.
    /// This operation is idempotent, but takes time even on already linked syntax sets.
    ///
    /// Linking runs in two passes over the syntaxes:
    ///
    /// 1. For every syntax that has a context named `prototype`, the contexts reachable
    ///    from it are marked as not including the prototype, and a pointer to the
    ///    prototype context is stored on the syntax.
    /// 2. Every context of every syntax is linked via `link_context`.
    ///
    /// Finally `is_linked` is set to `true`.
    ///
    /// # Panics
    /// The second pass uses `RefCell::borrow_mut` on each context, which panics if that
    /// context is already borrowed at that point.
    pub fn link_syntaxes(&mut self) {
        // 2 loops necessary to satisfy borrow checker :-(
        // Pass 1 needs `&mut` access to each syntax to store its prototype pointer.
        for syntax in &mut self.syntaxes {
            if let Some(proto_ptr) = syntax.contexts.get("prototype") {
                Self::recursively_mark_no_prototype(syntax, proto_ptr.clone());
                syntax.prototype = Some((*proto_ptr).clone());
            }
        }
        // Pass 2 only needs shared access to `self`, because resolving a reference may
        // have to look at any syntax in the set; the contexts themselves are mutated
        // through their `RefCell`s.
        for syntax in &self.syntaxes {
            for context_ptr in syntax.contexts.values() {
                let mut mut_ref = context_ptr.borrow_mut();
                self.link_context(syntax, mut_ref.deref_mut());
            }
        }
        self.is_linked = true;
    }

    /// Anything recursively included by the prototype shouldn't include the prototype.
    /// This marks them as such.
    ///
    /// Starting at `context_ptr`, sets `meta_include_prototype = false` on the context and
    /// then recurses into:
    ///
    /// * inline contexts pushed or set by its match patterns, and
    /// * contexts of `syntax` that it includes by name.
    ///
    /// Other kinds of includes are not followed, and named includes that do not exist in
    /// `syntax` are ignored.
    fn recursively_mark_no_prototype(syntax: &SyntaxDefinition, context_ptr: ContextPtr) {
        // `try_borrow_mut` fails for a context that is already mutably borrowed, which is
        // the case for every context currently being processed further up this recursion.
        // Skipping those is what stops the recursion on cyclic includes.
        if let Ok(mut mut_ref) = context_ptr.try_borrow_mut() {
            let context = mut_ref.deref_mut();
            context.meta_include_prototype = false;
            for pattern in &mut context.patterns {
                match *pattern {
                    // Apparently inline blocks also don't include the prototype when within the prototype.
                    // This is really weird, but necessary to run the YAML syntax.
                    Pattern::Match(ref mut match_pat) => {
                        // Only push/set operations carry context references to follow.
                        let maybe_context_refs = match match_pat.operation {
                            MatchOperation::Push(ref context_refs) |
                            MatchOperation::Set(ref context_refs) => Some(context_refs),
                            MatchOperation::Pop | MatchOperation::None => None,
                        };
                        if let Some(context_refs) = maybe_context_refs {
                            for context_ref in context_refs.iter() {
                                if let ContextReference::Inline(ref context_ptr) = *context_ref {
                                    Self::recursively_mark_no_prototype(syntax, context_ptr.clone());
                                }
                            }
                        }
                    }
                    // A named include refers to another context of the same syntax.
                    Pattern::Include(ContextReference::Named(ref s)) => {
                        if let Some(context_ptr) = syntax.contexts.get(s) {
                            Self::recursively_mark_no_prototype(syntax, context_ptr.clone());
                        }
                    }
                    _ => (),
                }
            }
        }
    }

    /// Links a single `context` belonging to `syntax`.
    ///
    /// If the context is flagged to include the prototype and `syntax` has one, a pointer
    /// to the prototype is stored on the context. After that every pattern of the context
    /// is linked: match patterns via `link_match_pat`, includes via `link_ref`.
    fn link_context(&self, syntax: &SyntaxDefinition, context: &mut Context) {
        if let (true, Some(proto_ptr)) = (context.meta_include_prototype,
                                          syntax.prototype.as_ref()) {
            context.prototype = Some((*proto_ptr).clone());
        }
        for pattern in context.patterns.iter_mut() {
            match *pattern {
                Pattern::Include(ref mut context_ref) => self.link_ref(syntax, context_ref),
                Pattern::Match(ref mut match_pat) => self.link_match_pat(syntax, match_pat),
            }
        }
    }

    /// Resolves a single context reference in place.
    ///
    /// `syntax` is the syntax the reference appears in. Depending on the variant:
    ///
    /// * `Named`: looked up among the contexts of `syntax`; `$top_level_main` is treated
    ///   as `main`.
    /// * `Inline`: the inline context is itself linked via `link_context`; the reference
    ///   is left as it is.
    /// * `ByScope` / `File`: the target syntax is searched in this set by scope or by
    ///   name, then its `sub_context` (or `main` when none is given) is looked up.
    /// * `Direct`: already linked, nothing to do.
    ///
    /// When a target context is found, the reference is replaced by a `Direct` link that
    /// holds a weak pointer to it. References that cannot be resolved are left unchanged
    /// without reporting an error.
    fn link_ref(&self, syntax: &SyntaxDefinition, context_ref: &mut ContextReference) {
        // println!("{:?}", context_ref);
        use super::syntax_definition::ContextReference::*;
        let maybe_new_context = match *context_ref {
            Named(ref s) => {
                // This isn't actually correct, but it is better than nothing/crashing.
                // This is being phased out anyhow, see https://github.com/sublimehq/Packages/issues/73
                // Fixes issue #30
                if s == "$top_level_main" {
                    syntax.contexts.get("main")
                } else {
                    syntax.contexts.get(s)
                }
            }
            Inline(ref context_ptr) => {
                // Panics if the inline context is already borrowed at this point.
                let mut mut_ref = context_ptr.borrow_mut();
                self.link_context(syntax, mut_ref.deref_mut());
                None
            }
            ByScope { scope, ref sub_context } => {
                let other_syntax = self.find_syntax_by_scope(scope);
                let context_name = sub_context.as_ref().map_or("main", |x| &**x);
                other_syntax.and_then(|s| s.contexts.get(context_name))
            }
            File { ref name, ref sub_context } => {
                let other_syntax = self.find_syntax_by_name(name);
                let context_name = sub_context.as_ref().map_or("main", |x| &**x);
                other_syntax.and_then(|s| s.contexts.get(context_name))
            }
            Direct(_) => None,
        };
        if let Some(new_context) = maybe_new_context {
            // Swap the resolved link in; the old reference is dropped at the end of this block.
            let mut new_ref = Direct(LinkerLink { link: Rc::downgrade(new_context) });
            mem::swap(context_ref, &mut new_ref);
        }
    }

    /// Links everything a match pattern refers to: each context reference of a push or
    /// set operation (via `link_ref`) and, if present, the `with_prototype` context
    /// (via `link_context`). Pop and no-op operations carry no references.
    fn link_match_pat(&self, syntax: &SyntaxDefinition, match_pat: &mut MatchPattern) {
        match match_pat.operation {
            MatchOperation::Push(ref mut context_refs) |
            MatchOperation::Set(ref mut context_refs) => {
                for context_ref in context_refs.iter_mut() {
                    self.link_ref(syntax, context_ref);
                }
            }
            MatchOperation::Pop | MatchOperation::None => {}
        }
        if let Some(ref context_ptr) = match_pat.with_prototype {
            let mut with_prototype = context_ptr.borrow_mut();
            self.link_context(syntax, with_prototype.deref_mut());
        }
    }
}

/// Cache of compiled `first_line_match` regexes, filled incrementally by `ensure_filled`
/// so that syntaxes added later only cost the compilation of their own regex.
#[derive(Debug)]
struct FirstLineCache {
    /// (first line regex, syntax index) pairs for all syntaxes with a first line regex
    /// built lazily on first use of `find_syntax_by_first_line`.
    /// The index is used by `find_syntax_by_first_line` to index the set's syntax list.
    regexes: Vec<(Regex, usize)>,
    /// To what extent the first line cache has been built: the length of the syntax
    /// list at the time of the last `ensure_filled` call that did any work.
    cached_until: usize,
}

impl Default for FirstLineCache {
    /// An empty cache: no compiled regexes and no syntaxes examined yet.
    fn default() -> Self {
        let regexes = Vec::new();
        FirstLineCache {
            cached_until: 0,
            regexes: regexes,
        }
    }
}

impl FirstLineCache {
    /// Creates an empty cache that has not examined any syntaxes yet.
    fn new() -> FirstLineCache {
        FirstLineCache::default()
    }

    /// Brings the cache up to date with `syntaxes`.
    ///
    /// Only syntaxes at positions `cached_until..` are examined, so calling this again
    /// after more syntaxes were appended compiles just the new regexes. Each compiled
    /// regex is stored together with the index of its syntax in the *full* `syntaxes`
    /// slice. `syntaxes` is expected to be the same list on every call, only ever grown
    /// at the end.
    fn ensure_filled(&mut self, syntaxes: &[SyntaxDefinition]) {
        let already_cached = self.cached_until;
        if already_cached >= syntaxes.len() {
            return;
        }

        // Enumerate the whole slice and skip the cached prefix so that `index` is the
        // absolute position in `syntaxes`. Enumerating the sub-slice instead would
        // restart at 0 and make later lookups return the wrong syntax.
        for (index, syntax) in syntaxes.iter().enumerate().skip(already_cached) {
            if let Some(ref reg_str) = syntax.first_line_match {
                // Best effort: a pattern that fails to compile simply never matches.
                if let Ok(reg) = Regex::new(reg_str) {
                    self.regexes.push((reg, index));
                }
            }
        }

        self.cached_until = syntaxes.len();
    }
}


// These tests read syntax definitions and sample files from the `testdata` directory,
// so they are only built when YAML loading is enabled.
#[cfg(feature = "yaml-load")]
#[cfg(test)]
mod tests {
    use super::*;
    use parsing::{Scope, syntax_definition};
    /// Loads and links the test packages, then exercises each lookup method.
    #[test]
    fn can_load() {
        let mut ps = SyntaxSet::load_from_folder("testdata/Packages").unwrap();
        // Lookup by shebang line.
        assert_eq!(&ps.find_syntax_by_first_line("#!/usr/bin/env node").unwrap().name,
                   "JavaScript");
        // Adding a syntax after the first-line cache has already been built.
        ps.load_plain_text_syntax();
        let rails_scope = Scope::new("source.ruby.rails").unwrap();
        let syntax = ps.find_syntax_by_name("Ruby on Rails").unwrap();
        // Panics if the plain text syntax added above cannot be found.
        ps.find_syntax_plain_text();
        assert_eq!(&ps.find_syntax_by_extension("rake").unwrap().name, "Ruby");
        // Token lookup falls back to a case-insensitive name match.
        assert_eq!(&ps.find_syntax_by_token("ruby").unwrap().name, "Ruby");
        // Lookup by editor mode line.
        assert_eq!(&ps.find_syntax_by_first_line("lol -*- Mode: C -*- such line").unwrap().name,
                   "C");
        // File lookup by extension.
        assert_eq!(&ps.find_syntax_for_file("testdata/parser.rs").unwrap().unwrap().name,
                   "Rust");
        // File lookup expected to resolve to Go although the extension is `.test`.
        assert_eq!(&ps.find_syntax_for_file("testdata/test_first_line.test")
                       .unwrap()
                       .unwrap()
                       .name,
                   "Go");
        assert!(&ps.find_syntax_by_first_line("derp derp hi lol").is_none());
        // Path lookup matches the end of the path the syntax was loaded from.
        assert_eq!(&ps.find_syntax_by_path("Packages/Rust/Rust.sublime-syntax").unwrap().name,
                   "Rust");
        // println!("{:#?}", syntax);
        assert_eq!(syntax.scope, rails_scope);
        // assert!(false);
        // Counts what `context_iter` yields starting from the Rails `main` context.
        let main_context = syntax.contexts.get("main").unwrap();
        let count = syntax_definition::context_iter(main_context.clone()).count();
        assert_eq!(count, 109);
    }
}