From 84adefa331c4159d432d22840663c38f155cd4c1 Mon Sep 17 00:00:00 2001 From: Erlang/OTP Date: Fri, 20 Nov 2009 14:54:40 +0000 Subject: The R13B03 release. --- lib/parsetools/doc/src/leex.xml | 455 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 455 insertions(+) create mode 100644 lib/parsetools/doc/src/leex.xml (limited to 'lib/parsetools/doc/src/leex.xml') diff --git a/lib/parsetools/doc/src/leex.xml b/lib/parsetools/doc/src/leex.xml new file mode 100644 index 0000000000..c113b586df --- /dev/null +++ b/lib/parsetools/doc/src/leex.xml @@ -0,0 +1,455 @@ + + + + +
+ + 20092009 + Ericsson AB. All Rights Reserved. + + + Copyright (c) 2008,2009 Robert Virding. All rights reserved. + + + leex + Robert Virding + nobody + + nobody + + 2009-05-07 + A + leex.xml +
+ leex + Lexical analyzer generator for Erlang + +

A regular expression based lexical analyzer generator for + Erlang, similar to lex or flex.

+

The Leex module should be considered experimental + as it will be subject to changes in future releases.

+
+
+ DATA TYPES + +ErrorInfo = {ErrorLine,module(),error_descriptor()} +ErrorLine = integer() +Token = tuple() +
+ + + file(FileName) -> FileReturn + file(FileName, Options) -> FileReturn + Generate a lexical analyzer + + FileName = filename() + Options = Option | [Option] + Option = - see below - + FileReturn = {ok, Scannerfile} + | {ok, Scannerfile, Warnings} + | error + | {error, Errors, Warnings} + Scannerfile = filename() + Warnings = Errors = [{filename(), [ErrorInfo]}] + + +

Generates a lexical analyzer from the definition in the input + file. The input file has the extension .xrl. This is + added to the filename if it is not given. The resulting module + is the Xrl filename without the .xrl extension.

+ +

The current options are:

+ + dfa_graph +

Generates a .dot file which contains a + description of the DFA in a format which can be viewed with + Graphviz, www.graphviz.org.

+
+ {includefile,Includefile} +

Uses a specific or customised prologue file + instead of the default + lib/parsetools/include/leexinc.hrl which is + otherwise included.

+
+ {report_errors, bool()} +

Causes errors to be printed as they occur. Default is + true.

+
+ {report_warnings, bool()} +

Causes warnings to be printed as they occur. Default is + true.

+
+ {report, bool()} +

This is a short form for both report_errors and + report_warnings.

+
+ {return_errors, bool()} +

If this flag is set, {error, Errors, Warnings} + is returned when there are errors. Default is false.

+
+ {return_warnings, bool()} +

If this flag is set, an extra field containing + Warnings is added to the tuple returned upon + success. Default is false.

+
+ {return, bool()} +

This is a short form for both return_errors and + return_warnings.

+
+ {scannerfile, Scannerfile} +

Scannerfile is the name of the file that + will contain the Erlang scanner code that is generated. + The default ("") is to add the extension + .erl to FileName stripped of the + .xrl extension.

+
+ {verbose, bool()} +

Outputs information from parsing the input file and + generating the internal tables.

+
+
+

Any of the Boolean options can be set to true by + stating the name of the option. For example, verbose + is equivalent to {verbose, true}.

+

Leex will add the extension .hrl to the + Includefile name and the extension .erl to the + Scannerfile name, unless the extension is already + there.

+
+
+ + format_error(ErrorInfo) -> Chars + Return an English description of an error tuple. + + Chars = [char() | Chars] + + +

Returns a string which describes the error + ErrorInfo returned when there is an error in a + regular expression.

+
+
+
+ + +
+ GENERATED SCANNER EXPORTS +

The following functions are exported by the generated scanner.

+
+ + + + string(String) -> StringRet + string(String, StartLine) -> StringRet + Generated by Leex + + String = string() + StringRet = {ok,Tokens,EndLine} | ErrorInfo + Tokens = [Token] + EndLine = StartLine = integer() + + +

Scans String and returns all the tokens in it, or an + error.

+

It is an error if not all of the characters in + String are consumed.

+
+
+ + + token(Cont, Chars) -> {more,Cont1} | {done,TokenRet,RestChars} + + token(Cont, Chars, StartLine) -> {more,Cont1} + | {done,TokenRet,RestChars} + + Generated by Leex + + Cont = [] | Cont1 + Cont1 = tuple() + Chars = RestChars = string() | eof + TokenRet = {ok, Token, EndLine} + | {eof, EndLine} + | ErrorInfo + StartLine = EndLine = integer() + + +

This is a re-entrant call to try and scan one token from + Chars. If there are enough characters in Chars + to either scan a token or detect an error then this will be + returned with {done,...}. Otherwise + {more,Cont1} will be returned where Cont1 is + used as Cont in the next call to token() with more characters + to try and scan the token. This is continued until a token + has been scanned. Cont is initially [].

+ +

It is not designed to be called directly by an application + but used through the i/o system where it can typically be + called in an application by:

+ +io:request(InFile, {get_until,Prompt,Module,token,[Line]}) + -> TokenRet +
+
+ + + tokens(Cont, Chars) -> {more,Cont1} | {done,TokensRet,RestChars} + + tokens(Cont, Chars, StartLine) -> + {more,Cont1} | {done,TokensRet,RestChars} + + Generated by Leex + + Cont = [] | Cont1 + Cont1 = tuple() + Chars = RestChars = string() | eof + TokensRet = {ok, Tokens, EndLine} + | {eof, EndLine} + | ErrorInfo + Tokens = [Token] + StartLine = EndLine = integer() + + +

This is a re-entrant call to try and scan tokens from + Chars. If there are enough characters in Chars + to either scan tokens or detect an error then this will be + returned with {done,...}. Otherwise + {more,Cont1} will be returned where Cont1 is + used as Cont in the next call to tokens() with more + characters to try and scan the tokens. This is continued + until all tokens have been scanned. Cont is initially + [].

+ +

This function differs from token in that it will + continue to scan tokens up to and including an + {end_token,Token} (see next + section). It will then return all the tokens. This is + typically used for scanning grammars like Erlang where there + is an explicit end token, '.'. If no end token is + found then the whole file will be scanned and returned. If + an error occurs then all tokens up to and including the next + end token will be skipped.

+ +

It is not designed to be called directly by an application + but used through the i/o system where it can typically be + called in an application by:

+ +io:request(InFile, {get_until,Prompt,Module,tokens,[Line]}) + -> TokensRet +
+
+
+ +
+ Input File Format +

Erlang style comments starting with a % are allowed in + scanner files. A definition file has the following format:

+ +<Header> + +Definitions. + +<Macro Definitions> + +Rules. + +<Token Rules> + +Erlang code. + +<Erlang code> + +

The "Definitions.", "Rules." and "Erlang code." headings are + mandatory and must occur at the beginning of a source line. The + <Header>, <Macro Definitions> and <Erlang code> + sections may be empty but there must be at least one rule.

+ +

Macro definitions have the following format:

+ + +NAME = VALUE + +

and there must be spaces around =. Macros can be used in + the regular expressions of rules by writing {NAME}.

+ +

When macros are expanded in expressions the macro calls + are replaced by the macro value without any form of quoting or + enclosing in parentheses.

+ +

Rules have the following format:

+ + +<Regexp> : <Erlang code>. + +

The <Regexp> must occur at the start of a line and not + include any blanks; use \\t and \\s to include TAB + and SPACE characters in the regular expression. If <Regexp> + matches then the corresponding <Erlang code> is evaluated to + generate a token. Within the Erlang code the following predefined + variables are available:

+ + + TokenChars +

A list of the characters in the matched token.

+
+ TokenLen +

The number of characters in the matched token.

+
+ TokenLine +

The line number where the token occurred.

+
+
+ +

The code must return:

+ + + {token,Token} +

Return Token to the caller.

+
+ {end_token,Token} +

Return Token to the caller as the last token in a tokens call.

+
+ skip_token +

Skip this token completely.

+
+ {error,ErrString} +

An error in the token, ErrString is a string + describing the error.

+
+
+ +

It is also possible to push back characters into the input + characters with the following returns:

+ + + {token,Token,PushBackList} + {end_token,Token,PushBackList} + {skip_token,PushBackList} + + +

These have the same meanings as the normal returns but the + characters in PushBackList will be prepended to the input + characters and scanned for the next token. Note that pushing + back a newline will mean the line numbering will no longer be + correct.

+ +

Pushing back characters gives you unexpected + possibilities to cause the scanner to loop!

+ +

The following example would match a simple Erlang integer or + float and return a token which could be sent to the Erlang + parser:

+ +D = [0-9] + +{D}+ : + {token,{integer,TokenLine,list_to_integer(TokenChars)}}. + +{D}+\\.{D}+((E|e)(\\+|\\-)?{D}+)? : + {token,{float,TokenLine,list_to_float(TokenChars)}}. + +

The Erlang code in the "Erlang code." section is written into + the output file directly after the module declaration and + predefined exports declaration so it is possible to add extra + exports, define imports and other attributes which are then + visible in the whole file.

+
+ +
+ Regular Expressions + +

The regular expressions allowed here are a subset of the set + found in egrep and in the AWK programming language, as + defined in the book, The AWK Programming Language, by A. V. Aho, + B. W. Kernighan, P. J. Weinberger. They are composed of the + following characters:

+ + + c +

Matches the non-metacharacter c.

+
+ \\c +

Matches the escape sequence or literal character c.

+
+ . +

Matches any character.

+
+ ^ +

Matches the beginning of a string.

+
+ $ +

Matches the end of a string.

+ [abc...] +

Character class, which matches any of the characters + abc.... Character ranges are specified by a pair of + characters separated by a -.

+
+ [^abc...] +

Negated character class, which matches any character + except abc....

+
+ r1 | r2 +

Alternation. It matches either r1 or r2.

+
+ r1r2 +

Concatenation. It matches r1 and then r2.

+
+ r+ +

Matches one or more rs.

+
+ r* +

Matches zero or more rs.

+
+ r? +

Matches zero or one rs.

+
+ (r) +

Grouping. It matches r.

+
+
+ +

The escape sequences allowed are the same as for Erlang strings:

+ + + \\b +

Backspace.

+ \\f +

Form feed.

+ \\n +

Newline (line feed).

+ \\r +

Carriage return.

+ \\t +

Tab.

+ \\e +

Escape.

+ \\v +

Vertical tab.

+ \\s +

Space.

+ \\d +

Delete.

+ \\ddd +

The octal value ddd.

+ \\xhh +

The hexadecimal value hh.

+ \\x{h...} +

The hexadecimal value h....

+ \\c +

Any other character literally, for example \\\\ for + backslash, \\" for ".

+
+
+ +

The following examples define Erlang data types:

+ +Atoms [a-z][0-9a-zA-Z_]* + +Variables [A-Z_][0-9a-zA-Z_]* + +Floats (\\+|-)?[0-9]+\\.[0-9]+((E|e)(\\+|-)?[0-9]+)? + +

Anchoring a regular expression with ^ and $ + is not implemented in the current version of Leex and just + generates a parse error.

+
+
-- cgit v1.2.3