From d0d5ae549e4811ed2ed274edc722bf591a567e28 Mon Sep 17 00:00:00 2001
From: Danny Robson
Date: Wed, 20 Dec 2017 12:45:05 +1100
Subject: [PATCH] uri: extract grammar into contained ragel file

this will allow us to reference the grammar from other grammars, e.g. http
parsing.

while moving the grammar, bring it in line with the RFCs: path-empty
matches the empty string (RFC 3986 `0<pchar>`), the fragment component is
optional, and HEXDIG accepts lowercase digits (ABNF literals are
case-insensitive).
---
 CMakeLists.txt |   2 +-
 abnf.rl        |  67 +++++++++++++++++++++++
 rfc3986.rl     | 140 +++++++++++++++++++++++++++++++++++++++++++++++++
 uri.cpp.rl     | 114 ++--------------------------------------
 4 files changed, 213 insertions(+), 110 deletions(-)
 create mode 100644 abnf.rl
 create mode 100644 rfc3986.rl

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e0d4dacb..87c0a627 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,7 +27,7 @@ endif()
 
 ###############################################################################
 RAGEL_TARGET(json-flat json/flat.cpp.rl ${CMAKE_CURRENT_BINARY_DIR}/json/flat.cpp COMPILE_FLAGS -G2)
-RAGEL_TARGET(uri uri.cpp.rl ${CMAKE_CURRENT_BINARY_DIR}/uri.cpp)
+RAGEL_TARGET(uri uri.cpp.rl ${CMAKE_CURRENT_BINARY_DIR}/uri.cpp COMPILE_FLAGS -G2)
 RAGEL_TARGET(version version.cpp.rl ${CMAKE_CURRENT_BINARY_DIR}/version.cpp)
 
 
diff --git a/abnf.rl b/abnf.rl
new file mode 100644
index 00000000..86903cdc
--- /dev/null
+++ b/abnf.rl
@@ -0,0 +1,67 @@
+void foo (void) { }
+
+%%{
+
+machine rfc5234;
+
+###############################################################################
+# RFC5234 ABNF core rules
+
+# ; A-Z / a-z
+ALPHA = 0x41..0x5A | 0x61..0x7A;
+BIT = '0' | '1';
+
+# any 7-bit US-ASCII character, excluding NUL
+CHAR = 0x01..0x7F;
+
+# carriage return
+CR = 0x0D;
+
+# linefeed
+LF = 0x0A;
+
+# Internet standard newline
+CRLF = CR LF;
+
+# controls
+CTL = 0x00..0x1F | 0x7F;
+
+# 0-9
+DIGIT = 0x30..0x39;
+
+# " (Double Quote)
+DQUOTE = 0x22;
+
+HEXDIG = DIGIT | 'A'..'F' | 'a'..'f';  # ABNF literals are case-insensitive (RFC 5234 2.3)
+
+# horizontal tab
+HTAB = 0x09;
+
+SP = 0x20;
+
+# white space
+WSP = SP | HTAB;
+
+# Use of this linear-white-space rule permits lines containing only white space
+# that are no longer legal in mail headers and have caused interoperability
+# problems in other contexts.
+#
+# Do not use when defining mail headers and use with caution in other contexts.
+LWSP = (WSP | CRLF WSP)*;
+
+# 8 bits of data
+OCTET = any; #0x00..0xFF;
+
+# visible (printing) characters
+VCHAR = 0x21..0x7E;
+
+    write data;
+
+}%%
+
+int main () {
+
+
+    %%write init;
+    %%write exec;
+}
diff --git a/rfc3986.rl b/rfc3986.rl
new file mode 100644
index 00000000..c11abd0f
--- /dev/null
+++ b/rfc3986.rl
@@ -0,0 +1,140 @@
+%%{
+    machine rfc3986;
+
+    #action trace;
+    #action success;
+    #action failure;
+
+    #action scheme_begin;
+    #action scheme_end;
+    #action hier_begin;
+    #action hier_end;
+    #action user_begin;
+    #action user_end;
+    #action host_begin;
+    #action host_end;
+    #action port_begin;
+    #action port_end;
+    #action authority_begin;
+    #action authority_end;
+    #action path_begin;
+    #action path_end;
+    #action query_begin;
+    #action query_end;
+    #action fragment_begin;
+    #action fragment_end;
+
+    #action uri_begin;
+    #action uri_end;
+
+    ## Characters
+    unreserved  = alpha | digit | "-" | "." | "_" | "~";
+    pct_encoded = '%' xdigit xdigit;
+    gen_delim   = ":" | "/" | "?" | "#" | "[" | "]" | "@";
+    sub_delim   = "!" | "$" | "&" | "'" | "(" | ")" | "*" | "+" | "," | ";" | "=";
+    pchar       = unreserved | pct_encoded | sub_delim | ':' | '@';
+
+    ## Atoms
+    reg_name = (unreserved | pct_encoded | sub_delim)*;
+
+    ## IP-address
+    ## Note: The address grammar is embedded in the RFC so we embed it too
+    dec_octet = digit | [1-9] digit | '1' digit{2} | '2' [0-4] digit | '25' [0-5];
+
+    ipv4address = dec_octet '.' dec_octet '.' dec_octet '.' dec_octet;
+
+    h16  = xdigit{1,4};
+    ls32 = (h16 ":" h16) | ipv4address;
+
+    ipv6address =
+                                     (h16 ":"){6} ls32
+        |                       "::" (h16 ":"){5} ls32
+        | (               h16)? "::" (h16 ":"){4} ls32
+        | ((h16 ":"){0,1} h16)? "::" (h16 ":"){3} ls32
+        | ((h16 ":"){0,2} h16)? "::" (h16 ":"){2} ls32
+        | ((h16 ":"){0,3} h16)? "::" (h16 ":"){1} ls32
+        | ((h16 ":"){0,4} h16)? "::"              ls32
+        | ((h16 ":"){0,5} h16)? "::"              h16
+        | ((h16 ":"){0,6} h16)? "::"
+    ;
+
+    ipvfuture  = 'v' xdigit{1,} '.' (unreserved | sub_delim | ':'){1,};
+    ip_literal = '[' (ipv6address | ipvfuture) ']';
+
+    ## Segments
+    segment       = pchar*;
+    segment_nz    = pchar{1,};
+    segment_nz_nc = (unreserved | pct_encoded | sub_delim | '@'){1,};
+
+    ## Paths
+    path_abempty  = ('/' segment)*;
+    path_absolute = '/' (segment_nz ('/' segment)*)?;
+    path_noscheme = segment_nz_nc ('/' segment)*;
+    path_rootless = segment_nz ('/' segment)*;
+    path_empty    = zlen;  # RFC 3986 `0<pchar>` means zero pchars, ie. only the empty string
+
+    path = (
+        path_abempty | path_absolute | path_noscheme | path_rootless | path_empty
+    );
+
+    reserved = gen_delim | sub_delim;
+
+    ## Authority
+    port = (
+        digit*
+    ) >port_begin %port_end;
+
+    host = (
+        ip_literal | ipv4address | reg_name
+    ) >host_begin %host_end;
+
+    userinfo = (
+        (unreserved | pct_encoded | sub_delim | ':')*
+    ) >user_begin %user_end;
+
+    authority = (
+        (userinfo '@')? host (':' port)?
+    ) >authority_begin %authority_end;
+
+
+    ## URI components
+    scheme = (
+        alpha (alpha | digit | '+' | '-' | '.')*
+    ) >scheme_begin %scheme_end;
+
+    query = (
+        (pchar | '/' | '?')*
+    ) >query_begin %query_end;
+
+    fragment = (
+        (pchar | '/' | '?')*
+    ) >fragment_begin %fragment_end;
+
+    ## URI types
+    hier_part = (
+        '//' (authority path_abempty >path_begin %path_end) >hier_begin %hier_end
+    ) | (
+          path_absolute >path_begin %path_end
+        | path_rootless >path_begin %path_end
+        | path_empty    >path_begin %path_end
+    ) >hier_begin %hier_end;
+
+    uri = scheme ':' hier_part ('?' query)? ('#' fragment)?;  # the fragment is optional (RFC 3986 section 3)
+
+    relative_part =
+          '//' authority path_abempty >path_begin %path_end
+        | path_absolute >path_begin %path_end
+        | path_noscheme >path_begin %path_end
+        | path_empty    >path_begin %path_end
+    ;
+
+    relative_ref = relative_part ('?' query)? ('#' fragment)?;  # the fragment is optional (RFC 3986 section 4.2)
+
+    uri_reference = uri | relative_ref;
+
+    absolute_uri = scheme ':' hier_part ('?' query)?;
+
+    URI = (
+        absolute_uri | uri_reference
+    ) >uri_begin %uri_end;
+}%%
diff --git a/uri.cpp.rl b/uri.cpp.rl
index a60f6f3c..a7485727 100644
--- a/uri.cpp.rl
+++ b/uri.cpp.rl
@@ -27,7 +27,7 @@
 
 
 %%{
-    machine uri;
+    machine impl;
 
     action trace { if (0) std::cerr << *p; }
     action success {__success = true; }
@@ -60,116 +60,12 @@
     action fragment_begin { m_views[FRAGMENT] = { p, p}; }
     action fragment_end { m_views[FRAGMENT] = { m_views[FRAGMENT].begin (), p }; }
 
-    ## Characters
-    unreserved  = alpha | digit | "-" | "." | "_" | "~";
-    pct_encoded = '%' xdigit xdigit;
-    gen_delim   = ":" | "/" | "?" | "#" | "[" | "]" | "@";
-    sub_delim   = "!" | "$" | "&" | "'" | "(" | ")" | "*" | "+" | "," | ";" | "=";
-    pchar       = unreserved | pct_encoded | sub_delim | ':' | '@';
+    action uri_begin {}
+    action uri_end {}
 
-    ## Atoms
-    reg_name = (unreserved | pct_encoded | sub_delim)*;
+    include rfc3986 'rfc3986.rl';
 
-    ## IP-address
-    ## Note: The address grammar is embedded in the RFC so we embed it too
-    dec_octet = digit | [1-9] digit | '1' digit{2} | '2' [0-4] digit | '25' [0-5];
-
-    ipv4address = dec_octet '.' dec_octet '.' dec_octet '.' dec_octet;
-
-    h16  = xdigit{1,4};
-    ls32 = (h16 ":" h16) | ipv4address;
-
-    ipv6address =
-                                     (h16 ":"){6} ls32
-        |                       "::" (h16 ":"){5} ls32
-        | (               h16)? "::" (h16 ":"){4} ls32
-        | ((h16 ":"){0,1} h16)? "::" (h16 ":"){3} ls32
-        | ((h16 ":"){0,2} h16)? "::" (h16 ":"){2} ls32
-        | ((h16 ":"){0,3} h16)? "::" (h16 ":"){1} ls32
-        | ((h16 ":"){0,4} h16)? "::"              ls32
-        | ((h16 ":"){0,5} h16)? "::"              h16
-        | ((h16 ":"){0,6} h16)? "::"
-    ;
-
-    ipvfuture  = 'v' xdigit{1,} '.' (unreserved | sub_delim | ':'){1,};
-    ip_literal = '[' (ipv6address | ipvfuture) ']';
-
-    ## Segments
-    segment       = pchar*;
-    segment_nz    = pchar{1,};
-    segment_nz_nc = (unreserved | pct_encoded | sub_delim | '@'){1,};
-
-    ## Paths
-    path_abempty  = ('/' segment)*;
-    path_absolute = '/' (segment_nz ('/' segment)*)?;
-    path_noscheme = segment_nz_nc ('/' segment)*;
-    path_rootless = segment_nz ('/' segment)*;
-    path_empty    = '0' pchar;
-
-    path = (
-        path_abempty | path_absolute | path_noscheme | path_rootless | path_empty
-    );
-
-    reserved = gen_delim | sub_delim;
-
-    ## Authority
-    port = (
-        digit*
-    ) >port_begin %port_end;
-
-    host = (
-        ip_literal | ipv4address | reg_name
-    ) >host_begin %host_end;
-
-    userinfo = (
-        (unreserved | pct_encoded | sub_delim | ':')*
-    ) >user_begin %user_end;
-
-    authority = (
-        (userinfo '@')? host (':' port)?
-    ) >authority_begin %authority_end;
-
-
-    ## URI components
-    scheme = (
-        alpha (alpha | digit | '+' | '-' | '.')*
-    ) >scheme_begin %scheme_end;
-
-    query = (
-        (pchar | '/' | '?')*
-    ) >query_begin %query_end;
-
-    fragment = (
-        (pchar | '/' | '?')*
-    ) >fragment_begin %fragment_end;
-
-    ## URI types
-    hier_part = (
-        '//' (authority path_abempty >path_begin %path_end) >hier_begin %hier_end
-    ) | (
-          path_absolute >path_begin %path_end
-        | path_rootless >path_begin %path_end
-        | path_empty    >path_begin %path_end
-    ) >hier_begin %hier_end;
-
-    uri = scheme ':' hier_part ('?' query)? ('#' fragment);
-
-    relative_part =
-          '//' authority path_abempty >path_begin %path_end
-        | path_absolute >path_begin %path_end
-        | path_noscheme >path_begin %path_end
-        | path_empty    >path_begin %path_end
-    ;
-
-    relative_ref = relative_part ('?' query)? ('#' fragment);
-
-    uri_reference = uri | relative_ref;
-
-    absolute_uri = scheme ':' hier_part ('?' query)?;
-
-    URI := (
-        absolute_uri | uri_reference
-    )
+    impl := URI
         %success
         $!failure
         $trace;