1 ;;; xpath-parser.el --- XPATH parser
3 ;; Copyright (C) 2001 Alex Schroeder <alex@gnu.org>
5 ;; Author: Alex Schroeder <alex@gnu.org>
6 ;; Maintainer: Oliver Scholz <epameinondas@gmx.de>
9 ;; URL: http://www.emacswiki.org/cgi-bin/wiki.pl?XmlParser
10 ;; Version: $Id: xpath-parser.el,v 1.6 2003/12/16 00:27:36 egoge Exp egoge $
12 ;; This file is not part of GNU Emacs.
14 ;; This is free software; you can redistribute it and/or modify it under
15 ;; the terms of the GNU General Public License as published by the Free
16 ;; Software Foundation; either version 2, or (at your option) any later
19 ;; This is distributed in the hope that it will be useful,
20 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
21 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 ;; GNU General Public License for more details.
24 ;; You should have received a copy of the GNU General Public License
25 ;; along with GNU Emacs; see the file COPYING. If not, write to the
26 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
27 ;; Boston, MA 02111-1307, USA.
31 ;; Used by xpath.el, tables created automatically from xpath.bnf. The
32 ;; main entry points are `xpath-lex-string' and `xpath-lex-region'.
33 ;; These two functions prepare a list of preliminary tokens and store
34 ;; them in the variable `xpath-token-input'. Next, call `wisent-parse'
35 ;; using `xpath-tables' and `xpath-pop-input' and an error function of
36 ;; your liking: (wisent-parse xpath-tables #'xpath-pop-input #'error)
38 ;; `wisent-parse' then returns a list of elements STEP. Each STEP has
39 ;; the form (TEST PREDICATE). Both TEST and PREDICATE have the form
40 ;; (FUNC PARAMS...). FUNC is always a function which must accept all
41 ;; the PARAMS as arguments, plus a node. The TEST FUNC must then return
42 ;; a list of nodes, the PREDICATE must return either nil or non-nil.
43 ;; The PREDICATE is used for filtering the list returned by TEST FUNC.
45 ;; See xpath.el for more information on all the functions used.
50 (eval-when-compile (require 'wisent-comp))
52 ;; (setq wisent-verbose-flag t)
53 (defconst xpath-document-root-symbol
54 (make-symbol "document-root")
55 "Symbol used to indicate the document root.
56 This is used to specify that a query should start from the owner
57 document. This is necessary for absolute location paths.")
60 (wisent-compile-grammar
61 '((NCNAME LITERAL NUMBER VARIABLEREFERENCE
63 COLON AXISSUF DOTDOT LT GT LE GE NE STAR PLUS MINUS SLASH
64 UNION LPAREN RPAREN LBRACK RBRACK AT DOT EQ COMMA
66 ANCESTOR ANCESTOR-OR-SELF ATTRIBUTE CHILD DESCENDANT
67 DESCENDANT-OR-SELF FOLLOWING FOLLOWING-SIBLING NAMESPACE
68 PARENT PRECEDING PRECEDING-SIBLING SELF)
73 ((RelativeLocationPath))
74 ((AbsoluteLocationPath)))
77 ((SLASH RelativeLocationPath)
78 (append (list xpath-document-root-symbol) $2))
79 ((AbbreviatedAbsoluteLocationPath)))
82 ((RelativeLocationPath SLASH Step)
84 ((AbbreviatedRelativeLocationPath)))
92 ((predicates Predicate)
95 ((AxisName AXISSUF NodeTest)
98 (list 'xpath-child-axis $1)))
101 'xpath-ancestor-axis)
103 'xpath-ancestor-or-self-axis)
105 'xpath-attribute-axis)
109 'xpath-descendant-axis)
110 ((DESCENDANT-OR-SELF)
111 'xpath-descendant-or-self-axis)
113 'xpath-following-axis)
115 'xpath-following-sibling-axis)
117 'xpath-namespace-axis)
121 'xpath-preceding-axis)
128 (list 'xpath-name-filter $1))
129 ((NODETYPE LPAREN Arglist RPAREN)
130 (list 'xpath-node-type-filter $1))
131 ;; ((PROCESSING-INSTRUCTION LPAREN LITERAL RPAREN))
134 ((LBRACK PredicateExpr RBRACK)
138 (AbbreviatedAbsoluteLocationPath
139 ((SLASH SLASH RelativeLocationPath)))
140 (AbbreviatedRelativeLocationPath
141 ((RelativeLocationPath SLASH SLASH Step)))
151 ((VARIABLEREFERENCE))
152 ((LPAREN Expr RPAREN))
157 ((FUNCTIONNAME LPAREN Arglist RPAREN)
159 (list (intern (concat "xpath-function/" $1)))
163 ;; 'xpath-position-function)
165 ;; 'xpath-last-function)
167 ;; 'xpath-count-function)
169 ;; 'xpath-name-function))
176 ((Arguments COMMA Argument)
183 ((UnionExpr UNION PathExpr)))
186 (list 'xpath-resolve-steps 'xpath-context-node
189 ((FilterExpr SLASH RelativeLocationPath))
190 ((FilterExpr SLASH SLASH RelativeLocationPath)))
193 ((FilterExpr Predicate)))
196 ((OrExpr OR AndExpr)))
199 ((AndExpr AND EqualityExpr)))
202 ((EqualityExpr EQ RelationalExpr)
203 (list 'xpath-equal $1 $3))
204 ((EqualityExpr NE RelationalExpr)))
207 ((RelationalExpr LT AdditiveExpr))
208 ((RelationalExpr GT AdditiveExpr))
209 ((RelationalExpr LE AdditiveExpr))
210 ((RelationalExpr GE AdditiveExpr)))
212 ((MultiplicativeExpr))
213 ((AdditiveExpr PLUS MultiplicativeExpr))
214 ((AdditiveExpr MINUS MultiplicativeExpr)))
217 ((MultiplicativeExpr STAR UnaryExpr))
218 ((MultiplicativeExpr DIV UnaryExpr))
219 ((MultiplicativeExpr MOD UnaryExpr)))
225 ((NCNAME COLON STAR))
226 ((NCNAME COLON NCNAME))
231 ;; ((PROCESSING-INSTRUCTION))
237 "Parser automaton for XPath.")
240 (defconst xpath-operator-names
246 (defconst xpath-other-operators
259 (defvar xpath-other-tokens
260 '((":" . COLON) ; CAVEAT: to resolve QNames
261 ("*" . STAR) ; CAVEAT
281 (defconst xpath-node-types
282 '(("comment" . COMMENT)
284 ("processing-instruction" . PROCESSING-INSTRUCTION)
288 '(("ancestor" . ANCESTOR)
289 ("ancestor-or-self" . ANCESTOR-OR-SELF)
290 ("attribute" . ATTRIBUTE)
292 ("descendant" . DESCENDANT)
293 ("descendant-or-self" . DESCENDANT-OR-SELF)
294 ("following" . FOLLOWING)
295 ("following-sibling" . FOLLOWING-SIBLING)
296 ("namespace" . NAMESPACE)
298 ("preceding" . PRECEDING)
299 ("preceding-sibling" . PRECEDING-SIBLING)
301 ) ; End: `eval-and-compile'
303 (defconst xpath-lexer-obarray
304 (let ((xpath-obarray (make-vector 13 0)))
305 ;; We need this only for non-letter tokens, because we return a
306 ;; letter-keyword (like an axis name) by interning the match-string.
307 (dolist (elt (append xpath-other-tokens xpath-other-operators))
308 (set (intern (car elt) xpath-obarray) (cdr elt)))
310 "Obarray to lookup some token classes.")
312 (defsubst xpath-lookup-token (str)
313 "Return the tokenclass of token string STR."
314 (symbol-value (intern str xpath-lexer-obarray)))
317 (defconst xpath-ncname-rx
318 `(and (or letter ?_) (zero-or-more (or letter digit ?. ?- ?_)))
319 "Symbolic regexp matching NCnames."))
321 (defconst xpath-ncname-regexp
323 (rx-to-string xpath-ncname-rx))
324 "Regexp matching NCNames.")
326 (defconst xpath-number-regexp
327 (rx (or (and (one-or-more digit)
328 (optional (and ?. (zero-or-more digit))))
329 (and ?. (one-or-more digit))))
330 "Regexp matching numbers.")
332 (defconst xpath-variable-reference-regexp
334 (rx-to-string `(and ?$
335 (optional (and ,xpath-ncname-rx
338 "Regexp matching VariableReferences.")
340 (defsubst xpath-lex-advance-and-return (token &optional return step)
341 "Move forward and return the token as appropriate for parser.
342 This function assumes that the match-data are set appropriately.
344 See `xpath-next-token' for a description of the format of the
347 RETURN is the number of the submatch which determines parts of
348 the value returned. It defaults to 0. STEP is the submatch to
349 whose end point will move, it defaults to the value of RETURN."
350 (or return (setq return 0))
351 (goto-char (match-end (or step return)))
352 (let ((str (match-string-no-properties return)))
356 (cons (match-beginning return)
357 (match-end return)))))
359 (defsubst xpath-operator-allowed-p (preceding)
360 "Return non-nil, if an OPERATOR token is allowed at this point.
361 See XPath specification section 3.7, bullet point #1.
362 PRECEDING is a symbol, the last token previously returned by the
367 (append '(AT DOTDOT LPAREN LBRACK COMMA)
369 (append xpath-operator-names
370 xpath-other-operators
373 (defsubst xpath-lex-resolve-ncname (str beg end preceding)
374 "Return the appropriate token value for NCName string STR.
376 There are special lexical conventions for OperatorName,
377 FunctionName, NodeType and AxisName. This function resolves these
378 conventions. See XPath specification section 3.7, bullet points
381 BEG and END are the begin and end of STR in the buffer
382 respectively. PRECEDING is the last token class previously
383 returned by the lexer.
385 See `xpath-next-token' for a description of the format of the
390 ((and (member str (eval-when-compile
391 (mapcar 'car xpath-operator-names)))
392 (xpath-operator-allowed-p preceding))
393 (setq token (intern (upcase str))))
394 ;; FunctionName or NodeType
395 ((eq (char-after) ?\()
397 (if (member str (eval-when-compile
398 (mapcar 'car xpath-node-types)))
403 (setq token (intern (upcase str))))
405 (t (setq token 'NCNAME)))
406 (nconc (list token str) (cons beg end))))
408 (defun xpath-next-token (preceding)
409 "Return the next XPath token from the buffer.
411 PRECEDING should be the last token-class returned by this
412 function on a previous call or nil.
414 The return value is a list of the form
415 \(TOKEN-CLASS VALUE START . END)
416 as a Wisent parser automaton expects it from its lexing
418 (skip-chars-forward "\x20\x9\xd\xa") ; ExprWhitespace
422 (list wisent-eoi-term))
424 ;; '*', other tokens like '(', other operators like '/'.
425 ;; See XPath spec section 3.7: `ExprToken' and `Operator'.
427 ;; The question whether a '*' is a `MultiplyOperator' or part of a
428 ;; `NodeTest' is resolved by the Wisent parser automaton.
429 ((looking-at (eval-when-compile
430 (regexp-opt (mapcar 'car
431 (append xpath-other-operators
432 xpath-other-tokens)))))
433 (xpath-lex-advance-and-return (xpath-lookup-token (match-string 0))))
435 ;; NCName: AxisName, NodeType, FunctionName or NameTest.
437 ;; We deal with `NodeTest's in the parser automaton. Therefore we
438 ;; have a token class for NCNames: NCNAME. We return this class if
439 ;; the NCName at hand is neither AxisName nor NodeType nor
441 ((looking-at xpath-ncname-regexp)
442 (goto-char (match-end 0))
443 (skip-chars-forward "\x20\x9\xd\xa") ; ExprWhitespace
444 (xpath-lex-resolve-ncname (match-string 0)
450 ((looking-at (rx (or (and ?\" (submatch (zero-or-more (not (any "\"")))) ?\")
451 (and ?\' (submatch (zero-or-more (not (any "\'")))) ?\'))))
452 (xpath-lex-advance-and-return 'LITERAL 1 0))
455 ((looking-at xpath-number-regexp)
456 (xpath-lex-advance-and-return 'NUMBER))
459 ((looking-at xpath-variable-reference-regexp)
460 (xpath-lex-advance-and-return 'VARIABLEREFERENCE))
462 (t (error "Could not reckognize token: %s"
463 (buffer-substring (point) (point-max))))))
466 (defun xpath-steps (str)
467 "Return the XPATH steps for string STR."
470 (goto-char (point-min))
472 (wisent-parse xpath-tables
474 (let ((token (xpath-next-token last-token)))
475 (setq last-token (car token))
480 ;; (defvar egoge-test nil)
481 ;; (defun egoge-test ()
483 ;; (let ((token (xpath-next-token egoge-test)))
484 ;; (setq egoge-test (car token))
487 ;; (xpath-test-lex-string "node1/node2")
488 ;; (xpath-steps "child::node1/child::node2")
489 ;; (xpath-test-lex-string "/node1")
490 ;; (xpath-steps "/node1")
491 ;; (xpath-steps "node1/node2")
492 ;; (xpath-steps "child::para[position()=2]")
494 (defmacro xpath-assert (expr)
496 (error "Test failed: %S" ',expr)))
498 (defun xpath-test-lex-string (str)
501 (goto-char (point-min))
502 (let (last-token list)
504 (let ((token (xpath-next-token last-token)))
505 (setq last-token (car token))
512 (xpath-assert (equal (xpath-steps "child::para")
513 '((xpath-child-axis (xpath-name-filter "para")))))
514 (xpath-assert (equal (xpath-steps "child::para/parent::*")
515 '((xpath-child-axis (xpath-name-filter "para"))
516 (xpath-parent-axis (xpath-name-filter "*")))))
517 (xpath-assert (equal (xpath-steps "child::para/parent::text()")
518 '((xpath-child-axis (xpath-name-filter "para"))
519 (xpath-parent-axis (xpath-node-type-filter "text")))))
520 (xpath-assert (equal (xpath-steps "child::*")
521 '((xpath-child-axis (xpath-name-filter "*")))))
522 (xpath-assert (equal (xpath-steps "child::foo/child::bar/child::test")
523 '((xpath-child-axis (xpath-name-filter "foo"))
524 (xpath-child-axis (xpath-name-filter "bar"))
525 (xpath-child-axis (xpath-name-filter "test")))))
526 (xpath-assert (equal (xpath-test-lex-string "child::*[position() = 1]")
527 '((CHILD "child" 1 . 6)
531 (FUNCTIONNAME "position" 10 . 18)
536 (RBRACK "]" 24 . 25))))
537 (xpath-assert (equal (xpath-steps "child::*[position() = 1]")
538 '((xpath-child-axis (xpath-name-filter "*")
539 (xpath-equal (xpath-function/position) "1")))))
540 (xpath-assert (equal (xpath-steps "child::*[position(1,2,3,4) = 1]")
541 '((xpath-child-axis (xpath-name-filter "*")
542 (xpath-equal (xpath-function/position
545 (xpath-assert (equal (xpath-steps "child::*[attribute::type=\"id\"]")
546 '((xpath-child-axis (xpath-name-filter "*")
547 (xpath-equal (xpath-resolve-steps
549 (quote ((xpath-attribute-axis
550 (xpath-name-filter "type")))))
554 (provide 'xpath-parser)
556 ;;; xpath-parser.el ends here