Re-refactor the core scanner's API, in order to get out from under the problem of different parsers having different YYSTYPE unions that they want to use with it.

I defined a new union core_YYSTYPE that is just the (very short) list of semantic values returned by the core scanner. I had originally worried that this would require an extra interface layer, but actually we can have parser.c's base_yylex (formerly filtered_base_yylex) take care of that at no extra cost. Names associated with the core scanner are now "core_yy_foo", with "base_yy_foo" being used in the core Bison parser and the parser.c interface layer.

This solves the last serious stumbling block to eliminating plpgsql's separate lexer. One restriction that will still be present is that plpgsql and the core will have to agree on the token numbers assigned to tokens that can be returned by the core lexer. Since Bison doesn't seem willing to accept external assignments of those numbers, we'll have to live with decreeing that core and plpgsql grammars declare these tokens first and in the same order.
author: Tom Lane <tgl@sss.pgh.pa.us> 2009-11-09 18:38:48 +0000
committer: Tom Lane <tgl@sss.pgh.pa.us> 2009-11-09 18:38:48 +0000
commit: 10bcfa189bedaeaa6bfe8d7841ed3b17f23c0df4 (patch)
tree: 70b98c6fd252fb828a393d830322f64b37cd5e81 /src/include/parser
parent: 2ace38d226246b83e5cc4d8f4063a82a485ddc95 (diff)
2 files changed, 131 insertions, 69 deletions
diff --git a/src/include/parser/gramparse.h b/src/include/parser/gramparse.h
index 09c99091361..41774028b5b 100644
--- a/src/include/parser/gramparse.h
+++ b/src/include/parser/gramparse.h
@@ -11,7 +11,7 @@
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/parser/gramparse.h,v 1.49 2009/11/05 23:24:26 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/parser/gramparse.h,v 1.50 2009/11/09 18:38:48 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -20,20 +20,11 @@
 #define GRAMPARSE_H
 
 #include "nodes/parsenodes.h"
-#include "parser/keywords.h"
+#include "parser/scanner.h"
 
 /*
- * We track token locations in terms of byte offsets from the start of the
- * source string, not the column number/line number representation that
- * bison uses by default.  Also, to minimize overhead we track only one
- * location (usually the first token location) for each construct, not
- * the beginning and ending locations as bison does by default.  It's
- * therefore sufficient to make YYLTYPE an int.
- */
-#define YYLTYPE  int
-
-/*
- * After defining YYLTYPE, it's safe to include gram.h.
+ * NB: include gram.h only AFTER including scanner.h, because scanner.h
+ * is what #defines YYLTYPE.
  */
 #include "parser/gram.h"
 
@@ -44,63 +35,25 @@
 typedef struct base_yy_extra_type
 {
 	/*
-	 * The string the lexer is physically scanning.  We keep this mainly so
-	 * that we can cheaply compute the offset of the current token (yytext).
+	 * Fields used by the core scanner.
 	 */
-	char	   *scanbuf;
-	Size		scanbuflen;
+	core_yy_extra_type core_yy_extra;
 
 	/*
-	 * The keyword list to use.
-	 */
-	const ScanKeyword *keywords;
-	int			num_keywords;
-
-	/*
-	 * literalbuf is used to accumulate literal values when multiple rules
-	 * are needed to parse a single literal.  Call startlit() to reset buffer
-	 * to empty, addlit() to add text.  NOTE: the string in literalbuf is
-	 * NOT necessarily null-terminated, but there always IS room to add a
-	 * trailing null at offset literallen.  We store a null only when we
-	 * need it.
-	 */
-	char	   *literalbuf;		/* palloc'd expandable buffer */
-	int			literallen;		/* actual current string length */
-	int			literalalloc;	/* current allocated buffer size */
-
-	int			xcdepth;		/* depth of nesting in slash-star comments */
-	char	   *dolqstart;		/* current $foo$ quote start string */
-
-	/* first part of UTF16 surrogate pair for Unicode escapes */
-	int32		utf16_first_part;
-
-	/* state variables for literal-lexing warnings */
-	bool		warn_on_first_escape;
-	bool		saw_non_ascii;
-
-	/*
-	 * State variables for filtered_base_yylex().
+	 * State variables for base_yylex().
 	 */
 	bool		have_lookahead;		/* is lookahead info valid? */
 	int			lookahead_token;	/* one-token lookahead */
-	YYSTYPE		lookahead_yylval;	/* yylval for lookahead token */
+	core_YYSTYPE lookahead_yylval;	/* yylval for lookahead token */
 	YYLTYPE		lookahead_yylloc;	/* yylloc for lookahead token */
 
 	/*
-	 * State variables that belong to the grammar, not the lexer.  It's
-	 * simpler to keep these here than to invent a separate structure.
-	 * These fields are unused/undefined if the lexer is invoked on its own.
+	 * State variables that belong to the grammar.
 	 */
-
 	List	   *parsetree;		/* final parse result is delivered here */
 } base_yy_extra_type;
 
 /*
- * The type of yyscanner is opaque outside scan.l.
- */
-typedef void *base_yyscan_t;
-
-/*
  * In principle we should use yyget_extra() to fetch the yyextra field
  * from a yyscanner struct.  However, flex always puts that field first,
  * and this is sufficiently performance-critical to make it seem worth
@@ -110,22 +63,11 @@ typedef void *base_yyscan_t;
 
 
 /* from parser.c */
-extern int	filtered_base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp,
-								base_yyscan_t yyscanner);
-
-/* from scan.l */
-extern base_yyscan_t scanner_init(const char *str,
-								  base_yy_extra_type *yyext,
-								  const ScanKeyword *keywords,
-								  int num_keywords);
-extern void scanner_finish(base_yyscan_t yyscanner);
 extern int	base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp,
-					   base_yyscan_t yyscanner);
-extern int	scanner_errposition(int location, base_yyscan_t yyscanner);
-extern void scanner_yyerror(const char *message, base_yyscan_t yyscanner);
+					   core_yyscan_t yyscanner);
 
 /* from gram.y */
 extern void parser_init(base_yy_extra_type *yyext);
-extern int	base_yyparse(base_yyscan_t yyscanner);
+extern int	base_yyparse(core_yyscan_t yyscanner);
 
 #endif   /* GRAMPARSE_H */
diff --git a/src/include/parser/scanner.h b/src/include/parser/scanner.h
new file mode 100644
index 00000000000..ccab1db862c
--- /dev/null
+++ b/src/include/parser/scanner.h
@@ -0,0 +1,120 @@
+/*-------------------------------------------------------------------------
+ *
+ * scanner.h
+ *		API for the core scanner (flex machine)
+ *
+ * The core scanner is also used by PL/pgsql, so we provide a public API
+ * for it.  However, the rest of the backend is only expected to use the
+ * higher-level API provided by parser.h.
+ *
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * $PostgreSQL: pgsql/src/include/parser/scanner.h,v 1.1 2009/11/09 18:38:48 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef SCANNER_H
+#define SCANNER_H
+
+#include "parser/keywords.h"
+
+/*
+ * The scanner returns extra data about scanned tokens in this union type.
+ * Note that this is a subset of the fields used in YYSTYPE of the bison
+ * parsers built atop the scanner.
+ */
+typedef union core_YYSTYPE
+{
+	int			ival;			/* for integer literals */
+	char	   *str;			/* for identifiers and non-integer literals */
+	const char *keyword;		/* canonical spelling of keywords */
+} core_YYSTYPE;
+
+/*
+ * We track token locations in terms of byte offsets from the start of the
+ * source string, not the column number/line number representation that
+ * bison uses by default.  Also, to minimize overhead we track only one
+ * location (usually the first token location) for each construct, not
+ * the beginning and ending locations as bison does by default.  It's
+ * therefore sufficient to make YYLTYPE an int.
+ */
+#define YYLTYPE  int
+
+/*
+ * Another important component of the scanner's API is the token code numbers.
+ * However, those are not defined in this file, because bison insists on
+ * defining them for itself.  The token codes used by the core scanner are
+ * the ASCII characters plus these:
+ *	%token <str>	IDENT FCONST SCONST BCONST XCONST Op
+ *	%token <ival>	ICONST PARAM
+ *	%token			TYPECAST DOT_DOT COLON_EQUALS
+ * The above token definitions *must* be the first ones declared in any
+ * bison parser built atop this scanner, so that they will have consistent
+ * numbers assigned to them (specifically, IDENT = 258 and so on).
+ */
+
+/*
+ * The YY_EXTRA data that a flex scanner allows us to pass around.
+ * Private state needed by the core scanner goes here.  Note that the actual
+ * yy_extra struct may be larger and have this as its first component, thus
+ * allowing the calling parser to keep some fields of its own in YY_EXTRA.
+ */
+typedef struct core_yy_extra_type
+{
+	/*
+	 * The string the scanner is physically scanning.  We keep this mainly so
+	 * that we can cheaply compute the offset of the current token (yytext).
+	 */
+	char	   *scanbuf;
+	Size		scanbuflen;
+
+	/*
+	 * The keyword list to use.
+	 */
+	const ScanKeyword *keywords;
+	int			num_keywords;
+
+	/*
+	 * literalbuf is used to accumulate literal values when multiple rules
+	 * are needed to parse a single literal.  Call startlit() to reset buffer
+	 * to empty, addlit() to add text.  NOTE: the string in literalbuf is
+	 * NOT necessarily null-terminated, but there always IS room to add a
+	 * trailing null at offset literallen.  We store a null only when we
+	 * need it.
+	 */
+	char	   *literalbuf;		/* palloc'd expandable buffer */
+	int			literallen;		/* actual current string length */
+	int			literalalloc;	/* current allocated buffer size */
+
+	int			xcdepth;		/* depth of nesting in slash-star comments */
+	char	   *dolqstart;		/* current $foo$ quote start string */
+
+	/* first part of UTF16 surrogate pair for Unicode escapes */
+	int32		utf16_first_part;
+
+	/* state variables for literal-lexing warnings */
+	bool		warn_on_first_escape;
+	bool		saw_non_ascii;
+} core_yy_extra_type;
+
+/*
+ * The type of yyscanner is opaque outside scan.l.
+ */
+typedef void *core_yyscan_t;
+
+
+/* Entry points in parser/scan.l */
+extern core_yyscan_t scanner_init(const char *str,
+								  core_yy_extra_type *yyext,
+								  const ScanKeyword *keywords,
+								  int num_keywords);
+extern void scanner_finish(core_yyscan_t yyscanner);
+extern int	core_yylex(core_YYSTYPE *lvalp, YYLTYPE *llocp,
+					   core_yyscan_t yyscanner);
+extern int	scanner_errposition(int location, core_yyscan_t yyscanner);
+extern void scanner_yyerror(const char *message, core_yyscan_t yyscanner);
+
+#endif   /* SCANNER_H */
author	Tom Lane <tgl@sss.pgh.pa.us>	2009-11-09 18:38:48 +0000
committer	Tom Lane <tgl@sss.pgh.pa.us>	2009-11-09 18:38:48 +0000
commit	10bcfa189bedaeaa6bfe8d7841ed3b17f23c0df4 (patch)
tree	70b98c6fd252fb828a393d830322f64b37cd5e81 /src/include/parser
parent	2ace38d226246b83e5cc4d8f4063a82a485ddc95 (diff)