/usr/share/common-lisp/source/cl-ppcre/parser.lisp is in cl-ppcre 2.0.3-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 | ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/parser.lisp,v 1.31 2009/09/17 19:17:31 edi Exp $
;;; The parser will - with the help of the lexer - parse a regex
;;; string and convert it into a "parse tree" (see docs for details
;;; about the syntax of these trees). Note that the lexer might
;;; return illegal parse trees. It is assumed that the conversion
;;; process later on will track them down.
;;; Copyright (c) 2002-2009, Dr. Edmund Weitz. All rights reserved.
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
;;; are met:
;;; * Redistributions of source code must retain the above copyright
;;; notice, this list of conditions and the following disclaimer.
;;; * Redistributions in binary form must reproduce the above
;;; copyright notice, this list of conditions and the following
;;; disclaimer in the documentation and/or other materials
;;; provided with the distribution.
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
(in-package :cl-ppcre)
(defun group (lexer)
"Parses and consumes a <group>.
The productions are: <group> -> \"\(\"<regex>\")\"
\"\(?:\"<regex>\")\"
\"\(?>\"<regex>\")\"
\"\(?<flags>:\"<regex>\")\"
\"\(?=\"<regex>\")\"
\"\(?!\"<regex>\")\"
\"\(?<=\"<regex>\")\"
\"\(?<!\"<regex>\")\"
\"\(?\(\"<num>\")\"<regex>\")\"
\"\(?\(\"<regex>\")\"<regex>\")\"
\"\(?<name>\"<regex>\")\" \(when *ALLOW-NAMED-REGISTERS* is T)
<legal-token>
where <flags> is parsed by the lexer function MAYBE-PARSE-FLAGS.
Will return <parse-tree> or \(<grouping-type> <parse-tree>) where
<grouping-type> is one of six keywords - see source for details."
(declare #.*standard-optimize-settings*)
(multiple-value-bind (open-token flags)
(get-token lexer)
(cond ((eq open-token :open-paren-paren)
;; special case for conditional regular expressions; note
;; that at this point we accept a couple of illegal
;; combinations which'll be sorted out later by the
;; converter
(let* ((open-paren-pos (car (lexer-last-pos lexer)))
;; check if what follows "(?(" is a number
(number (try-number lexer :no-whitespace-p t))
;; make changes to extended-mode-p local
(*extended-mode-p* *extended-mode-p*))
(declare (fixnum open-paren-pos))
(cond (number
;; condition is a number (i.e. refers to a
;; back-reference)
(let* ((inner-close-token (get-token lexer))
(reg-expr (reg-expr lexer))
(close-token (get-token lexer)))
(unless (eq inner-close-token :close-paren)
(signal-syntax-error* (+ open-paren-pos 2)
"Opening paren has no matching closing paren."))
(unless (eq close-token :close-paren)
(signal-syntax-error* open-paren-pos
"Opening paren has no matching closing paren."))
(list :branch number reg-expr)))
(t
;; condition must be a full regex (actually a
;; look-behind or look-ahead); and here comes a
;; terrible kludge: instead of being cleanly
;; separated from the lexer, the parser pushes
;; back the lexer by one position, thereby
;; landing in the middle of the 'token' "(?(" -
;; yuck!!
(decf (lexer-pos lexer))
(let* ((inner-reg-expr (group lexer))
(reg-expr (reg-expr lexer))
(close-token (get-token lexer)))
(unless (eq close-token :close-paren)
(signal-syntax-error* open-paren-pos
"Opening paren has no matching closing paren."))
(list :branch inner-reg-expr reg-expr))))))
((member open-token '(:open-paren
:open-paren-colon
:open-paren-greater
:open-paren-equal
:open-paren-exclamation
:open-paren-less-equal
:open-paren-less-exclamation
:open-paren-less-letter)
:test #'eq)
;; make changes to extended-mode-p local
(let ((*extended-mode-p* *extended-mode-p*))
;; we saw one of the six token representing opening
;; parentheses
(let* ((open-paren-pos (car (lexer-last-pos lexer)))
(register-name (when (eq open-token :open-paren-less-letter)
(parse-register-name-aux lexer)))
(reg-expr (reg-expr lexer))
(close-token (get-token lexer)))
(when (or (eq open-token :open-paren)
(eq open-token :open-paren-less-letter))
;; if this is the "("<regex>")" or "(?"<name>""<regex>")" production we have to
;; increment the register counter of the lexer
(incf (lexer-reg lexer)))
(unless (eq close-token :close-paren)
;; the token following <regex> must be the closing
;; parenthesis or this is a syntax error
(signal-syntax-error* open-paren-pos
"Opening paren has no matching closing paren."))
(if flags
;; if the lexer has returned a list of flags this must
;; have been the "(?:"<regex>")" production
(cons :group (nconc flags (list reg-expr)))
(if (eq open-token :open-paren-less-letter)
(list :named-register
;; every string was reversed, so we have to
;; reverse it back to get the name
(nreverse register-name)
reg-expr)
(list (case open-token
((:open-paren)
:register)
((:open-paren-colon)
:group)
((:open-paren-greater)
:standalone)
((:open-paren-equal)
:positive-lookahead)
((:open-paren-exclamation)
:negative-lookahead)
((:open-paren-less-equal)
:positive-lookbehind)
((:open-paren-less-exclamation)
:negative-lookbehind))
reg-expr))))))
(t
;; this is the <legal-token> production; <legal-token> is
;; any token which passes START-OF-SUBEXPR-P (otherwise
;; parsing had already stopped in the SEQ method)
open-token))))
(defun greedy-quant (lexer)
"Parses and consumes a <greedy-quant>.
The productions are: <greedy-quant> -> <group> | <group><quantifier>
where <quantifier> is parsed by the lexer function GET-QUANTIFIER.
Will return <parse-tree> or (:GREEDY-REPETITION <min> <max> <parse-tree>)."
(declare #.*standard-optimize-settings*)
(let* ((group (group lexer))
(token (get-quantifier lexer)))
(if token
;; if GET-QUANTIFIER returned a non-NIL value it's the
;; two-element list (<min> <max>)
(list :greedy-repetition (first token) (second token) group)
group)))
(defun quant (lexer)
"Parses and consumes a <quant>.
The productions are: <quant> -> <greedy-quant> | <greedy-quant>\"?\".
Will return the <parse-tree> returned by GREEDY-QUANT and optionally
change :GREEDY-REPETITION to :NON-GREEDY-REPETITION."
(declare #.*standard-optimize-settings*)
(let* ((greedy-quant (greedy-quant lexer))
(pos (lexer-pos lexer))
(next-char (next-char lexer)))
(when next-char
(if (char= next-char #\?)
(setf (car greedy-quant) :non-greedy-repetition)
(setf (lexer-pos lexer) pos)))
greedy-quant))
(defun seq (lexer)
"Parses and consumes a <seq>.
The productions are: <seq> -> <quant> | <quant><seq>.
Will return <parse-tree> or (:SEQUENCE <parse-tree> <parse-tree>)."
(declare #.*standard-optimize-settings*)
(flet ((make-array-from-two-chars (char1 char2)
(let ((string (make-array 2
:element-type 'character
:fill-pointer t
:adjustable t)))
(setf (aref string 0) char1)
(setf (aref string 1) char2)
string)))
;; Note that we're calling START-OF-SUBEXPR-P before we actually try
;; to parse a <seq> or <quant> in order to catch empty regular
;; expressions
(if (start-of-subexpr-p lexer)
(let ((quant (quant lexer)))
(if (start-of-subexpr-p lexer)
(let* ((seq (seq lexer))
(quant-is-char-p (characterp quant))
(seq-is-sequence-p (and (consp seq)
(eq (first seq) :sequence))))
(cond ((and quant-is-char-p
(characterp seq))
(make-array-from-two-chars seq quant))
((and quant-is-char-p
(stringp seq))
(vector-push-extend quant seq)
seq)
((and quant-is-char-p
seq-is-sequence-p
(characterp (second seq)))
(cond ((cddr seq)
(setf (cdr seq)
(cons
(make-array-from-two-chars (second seq)
quant)
(cddr seq)))
seq)
(t (make-array-from-two-chars (second seq) quant))))
((and quant-is-char-p
seq-is-sequence-p
(stringp (second seq)))
(cond ((cddr seq)
(setf (cdr seq)
(cons
(progn
(vector-push-extend quant (second seq))
(second seq))
(cddr seq)))
seq)
(t
(vector-push-extend quant (second seq))
(second seq))))
(seq-is-sequence-p
;; if <seq> is also a :SEQUENCE parse tree we merge
;; both lists into one to avoid unnecessary consing
(setf (cdr seq)
(cons quant (cdr seq)))
seq)
(t (list :sequence quant seq))))
quant))
:void)))
(defun reg-expr (lexer)
"Parses and consumes a <regex>, a complete regular expression.
The productions are: <regex> -> <seq> | <seq>\"|\"<regex>.
Will return <parse-tree> or (:ALTERNATION <parse-tree> <parse-tree>)."
(declare #.*standard-optimize-settings*)
(let ((pos (lexer-pos lexer)))
(case (next-char lexer)
((nil)
;; if we didn't get any token we return :VOID which stands for
;; "empty regular expression"
:void)
((#\|)
;; now check whether the expression started with a vertical
;; bar, i.e. <seq> - the left alternation - is empty
(list :alternation :void (reg-expr lexer)))
(otherwise
;; otherwise un-read the character we just saw and parse a
;; <seq> plus the character following it
(setf (lexer-pos lexer) pos)
(let* ((seq (seq lexer))
(pos (lexer-pos lexer)))
(case (next-char lexer)
((nil)
;; no further character, just a <seq>
seq)
((#\|)
;; if the character was a vertical bar, this is an
;; alternation and we have the second production
(let ((reg-expr (reg-expr lexer)))
(cond ((and (consp reg-expr)
(eq (first reg-expr) :alternation))
;; again we try to merge as above in SEQ
(setf (cdr reg-expr)
(cons seq (cdr reg-expr)))
reg-expr)
(t (list :alternation seq reg-expr)))))
(otherwise
;; a character which is not a vertical bar - this is
;; either a syntax error or we're inside of a group and
;; the next character is a closing parenthesis; so we
;; just un-read the character and let another function
;; take care of it
(setf (lexer-pos lexer) pos)
seq)))))))
(defun reverse-strings (parse-tree)
"Recursively walks through PARSE-TREE and destructively reverses all
strings in it."
(declare #.*standard-optimize-settings*)
(cond ((stringp parse-tree)
(nreverse parse-tree))
((consp parse-tree)
(loop for parse-tree-rest on parse-tree
while parse-tree-rest
do (setf (car parse-tree-rest)
(reverse-strings (car parse-tree-rest))))
parse-tree)
(t parse-tree)))
(defun parse-string (string)
"Translate the regex string STRING into a parse tree."
(declare #.*standard-optimize-settings*)
(let* ((lexer (make-lexer string))
(parse-tree (reverse-strings (reg-expr lexer))))
;; check whether we've consumed the whole regex string
(if (end-of-string-p lexer)
parse-tree
(signal-syntax-error* (lexer-pos lexer) "Expected end of string."))))
|