Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

# -*- coding: utf-8 -*- 

""" 

    pygments.regexopt 

    ~~~~~~~~~~~~~~~~~ 

 

    An algorithm that generates optimized regexes for matching long lists of 

    literal strings. 

 

    :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS. 

    :license: BSD, see LICENSE for details. 

""" 

 

import re 

from re import escape 

from os.path import commonprefix 

from itertools import groupby 

from operator import itemgetter 

 

CS_ESCAPE = re.compile(r'[\^\\\-\]]') 

FIRST_ELEMENT = itemgetter(0) 

 

 

def make_charset(letters): 

    return '[' + CS_ESCAPE.sub(lambda m: '\\' + m.group(), ''.join(letters)) + ']' 

 

 

def regex_opt_inner(strings, open_paren): 

    """Return a regex that matches any string in the sorted list of strings.""" 

    close_paren = open_paren and ')' or '' 

    # print strings, repr(open_paren) 

    if not strings: 

        # print '-> nothing left' 

        return '' 

    first = strings[0] 

    if len(strings) == 1: 

        # print '-> only 1 string' 

        return open_paren + escape(first) + close_paren 

    if not first: 

        # print '-> first string empty' 

        return open_paren + regex_opt_inner(strings[1:], '(?:') \ 

            + '?' + close_paren 

    if len(first) == 1: 

        # multiple one-char strings? make a charset 

        oneletter = [] 

        rest = [] 

        for s in strings: 

            if len(s) == 1: 

                oneletter.append(s) 

            else: 

                rest.append(s) 

        if len(oneletter) > 1:  # do we have more than one oneletter string? 

            if rest: 

                # print '-> 1-character + rest' 

                return open_paren + regex_opt_inner(rest, '') + '|' \ 

                    + make_charset(oneletter) + close_paren 

            # print '-> only 1-character' 

            return make_charset(oneletter) 

    prefix = commonprefix(strings) 

    if prefix: 

        plen = len(prefix) 

        # we have a prefix for all strings 

        # print '-> prefix:', prefix 

        return open_paren + escape(prefix) \ 

            + regex_opt_inner([s[plen:] for s in strings], '(?:') \ 

            + close_paren 

    # is there a suffix? 

    strings_rev = [s[::-1] for s in strings] 

    suffix = commonprefix(strings_rev) 

    if suffix: 

        slen = len(suffix) 

        # print '-> suffix:', suffix[::-1] 

        return open_paren \ 

            + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \ 

            + escape(suffix[::-1]) + close_paren 

    # recurse on common 1-string prefixes 

    # print '-> last resort' 

    return open_paren + \ 

        '|'.join(regex_opt_inner(list(group[1]), '') 

                 for group in groupby(strings, lambda s: s[0] == first[0])) \ 

        + close_paren 

 

 

def regex_opt(strings, prefix='', suffix=''): 

    """Return a compiled regex that matches any string in the given list. 

 

    The strings to match must be literal strings, not regexes.  They will be 

    regex-escaped. 

 

    *prefix* and *suffix* are pre- and appended to the final regex. 

    """ 

    strings = sorted(strings) 

    return prefix + regex_opt_inner(strings, '(') + suffix