uri: extract grammar into contained ragel file

this will allow us to reference the grammar from other grammars, eg http
parsing.
This commit is contained in:
Danny Robson 2017-12-20 12:45:05 +11:00
parent 202c22eee8
commit d0d5ae549e
4 changed files with 213 additions and 110 deletions

View File

@ -27,7 +27,7 @@ endif()
###############################################################################
RAGEL_TARGET(json-flat json/flat.cpp.rl ${CMAKE_CURRENT_BINARY_DIR}/json/flat.cpp COMPILE_FLAGS -G2)
RAGEL_TARGET(uri uri.cpp.rl ${CMAKE_CURRENT_BINARY_DIR}/uri.cpp)
RAGEL_TARGET(uri uri.cpp.rl ${CMAKE_CURRENT_BINARY_DIR}/uri.cpp COMPILE_FLAGS -G2)
RAGEL_TARGET(version version.cpp.rl ${CMAKE_CURRENT_BINARY_DIR}/version.cpp)

67
abnf.rl Normal file
View File

@ -0,0 +1,67 @@
void foo (void) { }
%%{
machine rfc5234;
###############################################################################
# RFC5234 ABNF core rules
# ; A-Z / a-z
ALPHA = 0x41..0x5A | 0x61..0x7A;
BIT = '0' | '1';
# any 7-bit US-ASCII character, excluding NUL
CHAR = 0x01..0x7F;
# carriage return
CR = 0x0D;
# linefeed
LF = 0x0A;
# Internet standard newline
CRLF = CR LF;
# controls
CTL = 0x00..0x1F | 0x7F;
# 0-9
DIGIT = 0x30..0x39;
# " (Double Quote)
DQUOTE = 0x22;
HEXDIG = DIGIT | 'A'..'F';
# horizontal tab
HTAB = 0x09;
SP = 0x20;
# white space
WSP = SP | HTAB;
# Use of this linear-white-space rule permits lines containing only white space
# that are no longer legal in mail headers and have caused interoperability
# problems in other contexts.
#
# Do not use when defining mail headers and use with caution in other contexts.
LWSP = (WSP | CRLF WSP)*;
# 8 bits of data
OCTET = any; #0x00..0xFF;
# visible (printing) characters
VCHAR = 0x21..0x7E;
write data;
}%%
int main () {
%%write init;
%%write exec;
}

140
rfc3986.rl Normal file
View File

@ -0,0 +1,140 @@
%%{
machine rfc3986;
#action trace;
#action success;
#action failure;
#action scheme_begin;
#action scheme_end;
#action hier_begin;
#action hier_end;
#action user_begin;
#action user_end;
#action host_begin;
#action host_end;
#action port_begin;
#action port_end;
#action authority_begin;
#action authority_end;
#action path_begin;
#action path_end;
#action query_begin;
#action query_end;
#action fragment_begin;
#action fragment_end;
#action uri_begin;
#action uri_end;
## Characters
unreserved = alpha | digit | "-" | "." | "_" | "~";
pct_encoded = '%' xdigit xdigit;
gen_delim = ":" | "/" | "?" | "#" | "[" | "]" | "@";
sub_delim = "!" | "$" | "&" | "'" | "(" | ")" | "*" | "+" | "," | ";" | "=";
pchar = unreserved | pct_encoded | sub_delim | ':' | '@';
## Atoms
reg_name = (unreserved | pct_encoded | sub_delim)*;
## IP-address
## Note: The address grammar is embedded in the RFC so we embed it too
dec_octet = digit | [1-9] digit | '1' digit{2} | '2' [0-4] digit | '25' [0-5];
ipv4address = dec_octet '.' dec_octet '.' dec_octet '.' dec_octet;
h16 = xdigit{1,4};
ls32 = (h16 ":" h16) | ipv4address;
ipv6address =
(h16 ":"){6} ls32
| "::" (h16 ":"){5} ls32
| ( h16)? "::" (h16 ":"){4} ls32
| ((h16 ":"){0,1} h16)? "::" (h16 ":"){3} ls32
| ((h16 ":"){0,2} h16)? "::" (h16 ":"){2} ls32
| ((h16 ":"){0,3} h16)? "::" (h16 ":"){1} ls32
| ((h16 ":"){0,4} h16)? "::" ls32
| ((h16 ":"){0,5} h16)? "::" h16
| ((h16 ":"){0,6} h16)? "::"
;
ipvfuture = 'v' xdigit{1,} '.' (unreserved | sub_delim | ':'){1,};
ip_literal = '[' (ipv6address | ipvfuture) ']';
## Segments
segment = pchar*;
segment_nz = pchar{1,};
segment_nz_nc = (unreserved | pct_encoded | sub_delim | '@'){1,};
## Paths
path_abempty = ('/' segment)*;
path_absolute = '/' (segment_nz ('/' segment)*)?;
path_noscheme = segment_nz_nc ('/' segment)*;
path_rootless = segment_nz ('/' segment)*;
path_empty = '0' pchar;
path = (
path_abempty | path_absolute | path_noscheme | path_rootless | path_empty
);
reserved = gen_delim | sub_delim;
## Authority
port = (
digit*
) >port_begin %port_end;
host = (
ip_literal | ipv4address | reg_name
) >host_begin %host_end;
userinfo = (
(unreserved | pct_encoded | sub_delim | ':')*
) >user_begin %user_end;
authority = (
(userinfo '@')? host (':' port)?
) >authority_begin %authority_end;
## URI components
scheme = (
alpha (alpha | digit | '+' | '-' | '.')*
) >scheme_begin %scheme_end;
query = (
(pchar | '/' | '?')*
) >query_begin %query_end;
fragment = (
(pchar | '/' | '?')*
) >fragment_begin %fragment_end;
## URI types
hier_part = (
'//' (authority path_abempty >path_begin %path_end) >hier_begin %hier_end
) | (
path_absolute >path_begin %path_end
| path_rootless >path_begin %path_end
| path_empty >path_begin %path_end
) >hier_begin %hier_end;
uri = scheme ':' hier_part ('?' query)? ('#' fragment);
relative_part =
'//' authority path_abempty >path_begin %path_end
| path_absolute >path_begin %path_end
| path_noscheme >path_begin %path_end
| path_empty >path_begin %path_end
;
relative_ref = relative_part ('?' query)? ('#' fragment);
uri_reference = uri | relative_ref;
absolute_uri = scheme ':' hier_part ('?' query)?;
URI = (
absolute_uri | uri_reference
) >uri_begin %uri_end;
}%%

View File

@ -27,7 +27,7 @@
%%{
machine uri;
machine impl;
action trace { if (0) std::cerr << *p; }
action success {__success = true; }
@ -60,116 +60,12 @@
action fragment_begin { m_views[FRAGMENT] = { p, p}; }
action fragment_end { m_views[FRAGMENT] = { m_views[FRAGMENT].begin (), p }; }
## Characters
unreserved = alpha | digit | "-" | "." | "_" | "~";
pct_encoded = '%' xdigit xdigit;
gen_delim = ":" | "/" | "?" | "#" | "[" | "]" | "@";
sub_delim = "!" | "$" | "&" | "'" | "(" | ")" | "*" | "+" | "," | ";" | "=";
pchar = unreserved | pct_encoded | sub_delim | ':' | '@';
action uri_begin {}
action uri_end {}
## Atoms
reg_name = (unreserved | pct_encoded | sub_delim)*;
include rfc3986 'rfc3986.rl';
## IP-address
## Note: The address grammar is embedded in the RFC so we embed it too
dec_octet = digit | [1-9] digit | '1' digit{2} | '2' [0-4] digit | '25' [0-5];
ipv4address = dec_octet '.' dec_octet '.' dec_octet '.' dec_octet;
h16 = xdigit{1,4};
ls32 = (h16 ":" h16) | ipv4address;
ipv6address =
(h16 ":"){6} ls32
| "::" (h16 ":"){5} ls32
| ( h16)? "::" (h16 ":"){4} ls32
| ((h16 ":"){0,1} h16)? "::" (h16 ":"){3} ls32
| ((h16 ":"){0,2} h16)? "::" (h16 ":"){2} ls32
| ((h16 ":"){0,3} h16)? "::" (h16 ":"){1} ls32
| ((h16 ":"){0,4} h16)? "::" ls32
| ((h16 ":"){0,5} h16)? "::" h16
| ((h16 ":"){0,6} h16)? "::"
;
ipvfuture = 'v' xdigit{1,} '.' (unreserved | sub_delim | ':'){1,};
ip_literal = '[' (ipv6address | ipvfuture) ']';
## Segments
segment = pchar*;
segment_nz = pchar{1,};
segment_nz_nc = (unreserved | pct_encoded | sub_delim | '@'){1,};
## Paths
path_abempty = ('/' segment)*;
path_absolute = '/' (segment_nz ('/' segment)*)?;
path_noscheme = segment_nz_nc ('/' segment)*;
path_rootless = segment_nz ('/' segment)*;
path_empty = '0' pchar;
path = (
path_abempty | path_absolute | path_noscheme | path_rootless | path_empty
);
reserved = gen_delim | sub_delim;
## Authority
port = (
digit*
) >port_begin %port_end;
host = (
ip_literal | ipv4address | reg_name
) >host_begin %host_end;
userinfo = (
(unreserved | pct_encoded | sub_delim | ':')*
) >user_begin %user_end;
authority = (
(userinfo '@')? host (':' port)?
) >authority_begin %authority_end;
## URI components
scheme = (
alpha (alpha | digit | '+' | '-' | '.')*
) >scheme_begin %scheme_end;
query = (
(pchar | '/' | '?')*
) >query_begin %query_end;
fragment = (
(pchar | '/' | '?')*
) >fragment_begin %fragment_end;
## URI types
hier_part = (
'//' (authority path_abempty >path_begin %path_end) >hier_begin %hier_end
) | (
path_absolute >path_begin %path_end
| path_rootless >path_begin %path_end
| path_empty >path_begin %path_end
) >hier_begin %hier_end;
uri = scheme ':' hier_part ('?' query)? ('#' fragment);
relative_part =
'//' authority path_abempty >path_begin %path_end
| path_absolute >path_begin %path_end
| path_noscheme >path_begin %path_end
| path_empty >path_begin %path_end
;
relative_ref = relative_part ('?' query)? ('#' fragment);
uri_reference = uri | relative_ref;
absolute_uri = scheme ':' hier_part ('?' query)?;
URI := (
absolute_uri | uri_reference
)
impl := URI
%success
$!failure
$trace;