From f1f3994d09d45f0e594a0543c5239f747408f1c7 Mon Sep 17 00:00:00 2001 From: rofl0r Date: Fri, 9 Oct 2020 00:22:14 +0100 Subject: [PATCH 01/13] conf: factor out list of regex into separate header this allows to include the regexes in another file and apply transformations and experiments. --- src/conf.c | 93 +----------------------------------------------- src/conf_regex.h | 93 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+), 92 deletions(-) create mode 100644 src/conf_regex.h diff --git a/src/conf.c b/src/conf.c index e0d6634f..85fcd55a 100644 --- a/src/conf.c +++ b/src/conf.c @@ -40,35 +40,6 @@ #include "basicauth.h" #include "conf-tokens.h" -/* - * The configuration directives are defined in the structure below. Each - * directive requires a regular expression to match against, and a - * function to call when the regex is matched. - * - * Below are defined certain constant regular expression strings that - * can (and likely should) be used when building the regex for the - * given directive. - */ -#define DIGIT "[0-9]" -#define SPACE "[ \t]" -#define WS SPACE "+" -#define STR "\"([^\"]+)\"" -#define BOOL "(yes|on|no|off)" -#define INT "(()" DIGIT "+)" -#define ALNUM "([-a-z0-9._]+)" -#define USERNAME "([^:]*)" -#define PASSWORD "([^@]*)" -#define IP "((([0-9]{1,3})\\.){3}[0-9]{1,3})" -#define IPMASK "(" IP "(/" DIGIT "+)?)" -#define IPV6 "(" \ - "(([0-9a-f:]{2,39}))|" \ - "(([0-9a-f:]{0,29}:" IP "))" \ - ")" - -#define IPV6MASK "(" IPV6 "(/" DIGIT "+)?)" -#define BEGIN "^" SPACE "*" -#define END SPACE "*$" - /* * Limit the maximum number of substring matches to a reasonably high * number. 
Given the usual structure of the configuration file, sixteen @@ -184,69 +155,7 @@ struct { CONFFILE_HANDLER handler; regex_t *cre; } directives[] = { - /* string arguments */ - STDCONF (logfile, STR, handle_logfile), - STDCONF (pidfile, STR, handle_pidfile), - STDCONF (anonymous, STR, handle_anonymous), - STDCONF (viaproxyname, STR, handle_viaproxyname), - STDCONF (defaulterrorfile, STR, handle_defaulterrorfile), - STDCONF (statfile, STR, handle_statfile), - STDCONF (stathost, STR, handle_stathost), - STDCONF (xtinyproxy, BOOL, handle_xtinyproxy), - /* boolean arguments */ - STDCONF (syslog, BOOL, handle_syslog), - STDCONF (bindsame, BOOL, handle_bindsame), - STDCONF (disableviaheader, BOOL, handle_disableviaheader), - /* integer arguments */ - STDCONF (port, INT, handle_port), - STDCONF (maxclients, INT, handle_maxclients), - STDCONF (maxspareservers, INT, handle_obsolete), - STDCONF (minspareservers, INT, handle_obsolete), - STDCONF (startservers, INT, handle_obsolete), - STDCONF (maxrequestsperchild, INT, handle_obsolete), - STDCONF (timeout, INT, handle_timeout), - STDCONF (connectport, INT, handle_connectport), - /* alphanumeric arguments */ - STDCONF (user, ALNUM, handle_user), - STDCONF (group, ALNUM, handle_group), - /* ip arguments */ - STDCONF (listen, "(" IP "|" IPV6 ")", handle_listen), - STDCONF (allow, "(" "(" IPMASK "|" IPV6MASK ")" "|" ALNUM ")", - handle_allow), - STDCONF (deny, "(" "(" IPMASK "|" IPV6MASK ")" "|" ALNUM ")", - handle_deny), - STDCONF (bind, "(" IP "|" IPV6 ")", handle_bind), - /* other */ - STDCONF (basicauth, ALNUM WS ALNUM, handle_basicauth), - STDCONF (errorfile, INT WS STR, handle_errorfile), - STDCONF (addheader, STR WS STR, handle_addheader), - -#ifdef FILTER_ENABLE - /* filtering */ - STDCONF (filter, STR, handle_filter), - STDCONF (filterurls, BOOL, handle_filterurls), - STDCONF (filterextended, BOOL, handle_filterextended), - STDCONF (filterdefaultdeny, BOOL, handle_filterdefaultdeny), - STDCONF (filtercasesensitive, BOOL, 
handle_filtercasesensitive), -#endif -#ifdef REVERSE_SUPPORT - /* Reverse proxy arguments */ - STDCONF (reversebaseurl, STR, handle_reversebaseurl), - STDCONF (reverseonly, BOOL, handle_reverseonly), - STDCONF (reversemagic, BOOL, handle_reversemagic), - STDCONF (reversepath, STR "(" WS STR ")?", handle_reversepath), -#endif -#ifdef UPSTREAM_SUPPORT - STDCONF (upstream, - "(" "(none)" WS STR ")|" \ - "(" "(http|socks4|socks5)" WS \ - "(" USERNAME /*username*/ ":" PASSWORD /*password*/ "@" ")?" - "(" IP "|" ALNUM ")" - ":" INT "(" WS STR ")?" ")", handle_upstream), -#endif - /* loglevel */ - STDCONF (loglevel, "(critical|error|warning|notice|connect|info)", - handle_loglevel) +#include "conf_regex.h" }; const unsigned int ndirectives = sizeof (directives) / sizeof (directives[0]); diff --git a/src/conf_regex.h b/src/conf_regex.h new file mode 100644 index 00000000..e52cc070 --- /dev/null +++ b/src/conf_regex.h @@ -0,0 +1,93 @@ +/* + * The configuration directives are defined in the structure below. Each + * directive requires a regular expression to match against, and a + * function to call when the regex is matched. + * + * Below are defined certain constant regular expression strings that + * can (and likely should) be used when building the regex for the + * given directive. 
+ */ +#define DIGIT "[0-9]" +#define SPACE "[ \t]" +#define WS SPACE "+" +#define STR "\"([^\"]+)\"" +#define BOOL "(yes|on|no|off)" +#define INT "(()" DIGIT "+)" +#define ALNUM "([-a-z0-9._]+)" +#define USERNAME "([^:]*)" +#define PASSWORD "([^@]*)" +#define IP "((([0-9]{1,3})\\.){3}[0-9]{1,3})" +#define IPMASK "(" IP "(/" DIGIT "+)?)" +#define IPV6 "(" \ + "(([0-9a-f:]{2,39}))|" \ + "(([0-9a-f:]{0,29}:" IP "))" \ + ")" + +#define IPV6MASK "(" IPV6 "(/" DIGIT "+)?)" +#define BEGIN "^" SPACE "*" +#define END SPACE "*$" + + +STDCONF (logfile, STR, handle_logfile), +STDCONF (pidfile, STR, handle_pidfile), +STDCONF (anonymous, STR, handle_anonymous), +STDCONF (viaproxyname, STR, handle_viaproxyname), +STDCONF (defaulterrorfile, STR, handle_defaulterrorfile), +STDCONF (statfile, STR, handle_statfile), +STDCONF (stathost, STR, handle_stathost), +STDCONF (xtinyproxy, BOOL, handle_xtinyproxy), +/* boolean arguments */ +STDCONF (syslog, BOOL, handle_syslog), +STDCONF (bindsame, BOOL, handle_bindsame), +STDCONF (disableviaheader, BOOL, handle_disableviaheader), +/* integer arguments */ +STDCONF (port, INT, handle_port), +STDCONF (maxclients, INT, handle_maxclients), +STDCONF (maxspareservers, INT, handle_obsolete), +STDCONF (minspareservers, INT, handle_obsolete), +STDCONF (startservers, INT, handle_obsolete), +STDCONF (maxrequestsperchild, INT, handle_obsolete), +STDCONF (timeout, INT, handle_timeout), +STDCONF (connectport, INT, handle_connectport), +/* alphanumeric arguments */ +STDCONF (user, ALNUM, handle_user), +STDCONF (group, ALNUM, handle_group), +/* ip arguments */ +STDCONF (listen, "(" IP "|" IPV6 ")", handle_listen), +STDCONF (allow, "(" "(" IPMASK "|" IPV6MASK ")" "|" ALNUM ")", + handle_allow), +STDCONF (deny, "(" "(" IPMASK "|" IPV6MASK ")" "|" ALNUM ")", + handle_deny), +STDCONF (bind, "(" IP "|" IPV6 ")", handle_bind), +/* other */ +STDCONF (basicauth, ALNUM WS ALNUM, handle_basicauth), +STDCONF (errorfile, INT WS STR, handle_errorfile), +STDCONF 
(addheader, STR WS STR, handle_addheader), + +#ifdef FILTER_ENABLE +/* filtering */ +STDCONF (filter, STR, handle_filter), +STDCONF (filterurls, BOOL, handle_filterurls), +STDCONF (filterextended, BOOL, handle_filterextended), +STDCONF (filterdefaultdeny, BOOL, handle_filterdefaultdeny), +STDCONF (filtercasesensitive, BOOL, handle_filtercasesensitive), +#endif +#ifdef REVERSE_SUPPORT +/* Reverse proxy arguments */ +STDCONF (reversebaseurl, STR, handle_reversebaseurl), +STDCONF (reverseonly, BOOL, handle_reverseonly), +STDCONF (reversemagic, BOOL, handle_reversemagic), +STDCONF (reversepath, STR "(" WS STR ")?", handle_reversepath), +#endif +#ifdef UPSTREAM_SUPPORT +STDCONF (upstream, + "(" "(none)" WS STR ")|" \ + "(" "(http|socks4|socks5)" WS \ + "(" USERNAME /*username*/ ":" PASSWORD /*password*/ "@" ")?" + "(" IP "|" ALNUM ")" + ":" INT "(" WS STR ")?" ")", handle_upstream), +#endif +/* loglevel */ +STDCONF (loglevel, "(critical|error|warning|notice|connect|info)", + handle_loglevel) + From 3eb238634a9a32f3a64605d021dd1e3cdef707a2 Mon Sep 17 00:00:00 2001 From: rofl0r Date: Fri, 9 Oct 2020 00:23:33 +0100 Subject: [PATCH 02/13] conf: properly escape tab in whitespace class --- src/conf_regex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/conf_regex.h b/src/conf_regex.h index e52cc070..d87073a3 100644 --- a/src/conf_regex.h +++ b/src/conf_regex.h @@ -8,7 +8,7 @@ * given directive. */ #define DIGIT "[0-9]" -#define SPACE "[ \t]" +#define SPACE "[ \\t]" #define WS SPACE "+" #define STR "\"([^\"]+)\"" #define BOOL "(yes|on|no|off)" From b07f7a8422af661982c5afeb5289f18e562c7f99 Mon Sep 17 00:00:00 2001 From: rofl0r Date: Fri, 9 Oct 2020 00:38:13 +0100 Subject: [PATCH 03/13] conf: remove empty parens group from regex using an empty group () is not defined in the posix spec, and as such "undefined behaviour", even though it happened to work with both GLIBC and MUSL libc, as well as with oniguruma's POSIX compatibility API. 
we used this idiom as a trick when refactoring the regex parsing, in order not to change the match indices of all the handler functions, ignorant that this is not explicitly allowed by the spec. to make future refactoring easier, we introduce a MGROUP1 macro that's added to each match group index, so we have only a single knob to turn in case a similar change becomes necessary again. --- src/conf.c | 94 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 48 insertions(+), 46 deletions(-) diff --git a/src/conf.c b/src/conf.c index 85fcd55a..21a661c6 100644 --- a/src/conf.c +++ b/src/conf.c @@ -142,7 +142,7 @@ static void config_free_regex (void); * do not follow the pattern above. This macro is for convenience * only. */ -#define STDCONF(d, re, func) [CD_ ## d] = { BEGIN "()" WS re END, func, NULL } +#define STDCONF(d, re, func) [CD_ ## d] = { BEGIN WS re END, func, NULL } /* * Holds the regular expression used to match the configuration directive, @@ -534,19 +534,21 @@ set_int_arg (unsigned int *var, const char *line, regmatch_t * match) * ***********************************************************************/ +#define MGROUP1 -1 + static HANDLE_FUNC (handle_logfile) { - return set_string_arg (&conf->logf_name, line, &match[2]); + return set_string_arg (&conf->logf_name, line, &match[MGROUP1+2]); } static HANDLE_FUNC (handle_pidfile) { - return set_string_arg (&conf->pidpath, line, &match[2]); + return set_string_arg (&conf->pidpath, line, &match[MGROUP1+2]); } static HANDLE_FUNC (handle_anonymous) { - char *arg = get_string_arg (line, &match[2]); + char *arg = get_string_arg (line, &match[MGROUP1+2]); if (!arg) return -1; @@ -562,7 +564,7 @@ static HANDLE_FUNC (handle_anonymous) static HANDLE_FUNC (handle_viaproxyname) { - int r = set_string_arg (&conf->via_proxy_name, line, &match[2]); + int r = set_string_arg (&conf->via_proxy_name, line, &match[MGROUP1+2]); if (r) return r; @@ -574,7 +576,7 @@ static HANDLE_FUNC (handle_viaproxyname) static 
HANDLE_FUNC (handle_disableviaheader) { - int r = set_bool_arg (&conf->disable_viaheader, line, &match[2]); + int r = set_bool_arg (&conf->disable_viaheader, line, &match[MGROUP1+2]); if (r) { return r; @@ -587,17 +589,17 @@ static HANDLE_FUNC (handle_disableviaheader) static HANDLE_FUNC (handle_defaulterrorfile) { - return set_string_arg (&conf->errorpage_undef, line, &match[2]); + return set_string_arg (&conf->errorpage_undef, line, &match[MGROUP1+2]); } static HANDLE_FUNC (handle_statfile) { - return set_string_arg (&conf->statpage, line, &match[2]); + return set_string_arg (&conf->statpage, line, &match[MGROUP1+2]); } static HANDLE_FUNC (handle_stathost) { - int r = set_string_arg (&conf->stathost, line, &match[2]); + int r = set_string_arg (&conf->stathost, line, &match[MGROUP1+2]); if (r) return r; @@ -608,7 +610,7 @@ static HANDLE_FUNC (handle_stathost) static HANDLE_FUNC (handle_xtinyproxy) { #ifdef XTINYPROXY_ENABLE - return set_bool_arg (&conf->add_xtinyproxy, line, &match[2]); + return set_bool_arg (&conf->add_xtinyproxy, line, &match[MGROUP1+2]); #else fprintf (stderr, "XTinyproxy NOT Enabled! 
Recompile with --enable-xtinyproxy\n"); @@ -618,12 +620,12 @@ static HANDLE_FUNC (handle_xtinyproxy) static HANDLE_FUNC (handle_syslog) { - return set_bool_arg (&conf->syslog, line, &match[2]); + return set_bool_arg (&conf->syslog, line, &match[MGROUP1+2]); } static HANDLE_FUNC (handle_bindsame) { - int r = set_bool_arg (&conf->bindsame, line, &match[2]); + int r = set_bool_arg (&conf->bindsame, line, &match[MGROUP1+2]); if (r) return r; @@ -633,7 +635,7 @@ static HANDLE_FUNC (handle_bindsame) static HANDLE_FUNC (handle_port) { - set_int_arg (&conf->port, line, &match[2]); + set_int_arg (&conf->port, line, &match[MGROUP1+2]); if (conf->port > 65535) { fprintf (stderr, "Bad port number (%d) supplied for Port.\n", @@ -646,7 +648,7 @@ static HANDLE_FUNC (handle_port) static HANDLE_FUNC (handle_maxclients) { - set_int_arg (&conf->maxclients, line, &match[2]); + set_int_arg (&conf->maxclients, line, &match[MGROUP1+2]); return 0; } @@ -659,24 +661,24 @@ static HANDLE_FUNC (handle_obsolete) static HANDLE_FUNC (handle_timeout) { - return set_int_arg (&conf->idletimeout, line, &match[2]); + return set_int_arg (&conf->idletimeout, line, &match[MGROUP1+2]); } static HANDLE_FUNC (handle_connectport) { - add_connect_port_allowed (get_long_arg (line, &match[2]), + add_connect_port_allowed (get_long_arg (line, &match[MGROUP1+2]), &conf->connect_ports); return 0; } static HANDLE_FUNC (handle_user) { - return set_string_arg (&conf->user, line, &match[2]); + return set_string_arg (&conf->user, line, &match[MGROUP1+2]); } static HANDLE_FUNC (handle_group) { - return set_string_arg (&conf->group, line, &match[2]); + return set_string_arg (&conf->group, line, &match[MGROUP1+2]); } static void warn_invalid_address(char *arg, unsigned long lineno) { @@ -685,7 +687,7 @@ static void warn_invalid_address(char *arg, unsigned long lineno) { static HANDLE_FUNC (handle_allow) { - char *arg = get_string_arg (line, &match[2]); + char *arg = get_string_arg (line, &match[MGROUP1+2]); if(insert_acl 
(arg, ACL_ALLOW, &conf->access_list) < 0) warn_invalid_address (arg, lineno); @@ -695,7 +697,7 @@ static HANDLE_FUNC (handle_allow) static HANDLE_FUNC (handle_deny) { - char *arg = get_string_arg (line, &match[2]); + char *arg = get_string_arg (line, &match[MGROUP1+2]); if(insert_acl (arg, ACL_DENY, &conf->access_list) < 0) warn_invalid_address (arg, lineno); @@ -705,7 +707,7 @@ static HANDLE_FUNC (handle_deny) static HANDLE_FUNC (handle_bind) { - int r = set_string_arg (&conf->bind_address, line, &match[2]); + int r = set_string_arg (&conf->bind_address, line, &match[MGROUP1+2]); if (r) return r; @@ -716,7 +718,7 @@ static HANDLE_FUNC (handle_bind) static HANDLE_FUNC (handle_listen) { - char *arg = get_string_arg (line, &match[2]); + char *arg = get_string_arg (line, &match[MGROUP1+2]); if (arg == NULL) { return -1; @@ -748,8 +750,8 @@ static HANDLE_FUNC (handle_errorfile) * present. This is why the "string" is located at * match[4] (rather than the more intuitive match[3]. */ - unsigned long int err = get_long_arg (line, &match[2]); - char *page = get_string_arg (line, &match[4]); + unsigned long int err = get_long_arg (line, &match[MGROUP1+2]); + char *page = get_string_arg (line, &match[MGROUP1+4]); if(add_new_errorpage (conf, page, err) < 0) { CP_WARN ("add_new_errorpage() failed: '%s'", page); @@ -760,8 +762,8 @@ static HANDLE_FUNC (handle_errorfile) static HANDLE_FUNC (handle_addheader) { - char *name = get_string_arg (line, &match[2]); - char *value = get_string_arg (line, &match[3]); + char *name = get_string_arg (line, &match[MGROUP1+2]); + char *value = get_string_arg (line, &match[MGROUP1+3]); http_header_t header; if (!conf->add_headers) { @@ -802,7 +804,7 @@ static HANDLE_FUNC (handle_loglevel) sizeof (log_levels) / sizeof (log_levels[0]); unsigned int i; - char *arg = get_string_arg (line, &match[2]); + char *arg = get_string_arg (line, &match[MGROUP1+2]); for (i = 0; i != nlevels; ++i) { if (!strcasecmp (arg, log_levels[i].string)) { @@ -819,10 
+821,10 @@ static HANDLE_FUNC (handle_loglevel) static HANDLE_FUNC (handle_basicauth) { char *user, *pass; - user = get_string_arg(line, &match[2]); + user = get_string_arg(line, &match[MGROUP1+2]); if (!user) return -1; - pass = get_string_arg(line, &match[3]); + pass = get_string_arg(line, &match[MGROUP1+3]); if (!pass) { safefree (user); return -1; @@ -840,48 +842,48 @@ static HANDLE_FUNC (handle_basicauth) #ifdef FILTER_ENABLE static HANDLE_FUNC (handle_filter) { - return set_string_arg (&conf->filter, line, &match[2]); + return set_string_arg (&conf->filter, line, &match[MGROUP1+2]); } static HANDLE_FUNC (handle_filterurls) { - return set_bool_arg (&conf->filter_url, line, &match[2]); + return set_bool_arg (&conf->filter_url, line, &match[MGROUP1+2]); } static HANDLE_FUNC (handle_filterextended) { - return set_bool_arg (&conf->filter_extended, line, &match[2]); + return set_bool_arg (&conf->filter_extended, line, &match[MGROUP1+2]); } static HANDLE_FUNC (handle_filterdefaultdeny) { - assert (match[2].rm_so != -1); + assert (match[MGROUP1+2].rm_so != -1); - if (get_bool_arg (line, &match[2])) + if (get_bool_arg (line, &match[MGROUP1+2])) filter_set_default_policy (FILTER_DEFAULT_DENY); return 0; } static HANDLE_FUNC (handle_filtercasesensitive) { - return set_bool_arg (&conf->filter_casesensitive, line, &match[2]); + return set_bool_arg (&conf->filter_casesensitive, line, &match[MGROUP1+2]); } #endif #ifdef REVERSE_SUPPORT static HANDLE_FUNC (handle_reverseonly) { - return set_bool_arg (&conf->reverseonly, line, &match[2]); + return set_bool_arg (&conf->reverseonly, line, &match[MGROUP1+2]); } static HANDLE_FUNC (handle_reversemagic) { - return set_bool_arg (&conf->reversemagic, line, &match[2]); + return set_bool_arg (&conf->reversemagic, line, &match[MGROUP1+2]); } static HANDLE_FUNC (handle_reversebaseurl) { - return set_string_arg (&conf->reversebaseurl, line, &match[2]); + return set_string_arg (&conf->reversebaseurl, line, &match[MGROUP1+2]); } static 
HANDLE_FUNC (handle_reversepath) @@ -891,12 +893,12 @@ static HANDLE_FUNC (handle_reversepath) */ char *arg1, *arg2; - arg1 = get_string_arg (line, &match[2]); + arg1 = get_string_arg (line, &match[MGROUP1+2]); if (!arg1) return -1; - if (match[4].rm_so != -1) { - arg2 = get_string_arg (line, &match[4]); + if (match[MGROUP1+4].rm_so != -1) { + arg2 = get_string_arg (line, &match[MGROUP1+4]); if (!arg2) { safefree (arg1); return -1; @@ -937,12 +939,12 @@ static HANDLE_FUNC (handle_upstream) enum proxy_type pt; enum upstream_build_error ube; - if (match[3].rm_so != -1) { - tmp = get_string_arg (line, &match[3]); + if (match[MGROUP1+3].rm_so != -1) { + tmp = get_string_arg (line, &match[MGROUP1+3]); if(!strcmp(tmp, "none")) { safefree(tmp); - if (match[4].rm_so == -1) return -1; - domain = get_string_arg (line, &match[4]); + if (match[MGROUP1+4].rm_so == -1) return -1; + domain = get_string_arg (line, &match[MGROUP1+4]); if (!domain) return -1; ube = upstream_add (NULL, 0, domain, 0, 0, PT_NONE, &conf->upstream_list); @@ -951,7 +953,7 @@ static HANDLE_FUNC (handle_upstream) } } - mi = 6; + mi = MGROUP1+6; tmp = get_string_arg (line, &match[mi]); pt = pt_from_string(tmp); From 393e51ba4589e03c7dd245475cdc57ac20a4c10d Mon Sep 17 00:00:00 2001 From: rofl0r Date: Fri, 9 Oct 2020 01:00:56 +0100 Subject: [PATCH 04/13] conf: remove second instance of empty parens ERE group likewise --- src/conf.c | 11 ++--------- src/conf_regex.h | 2 +- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/src/conf.c b/src/conf.c index 21a661c6..2ce1b2c7 100644 --- a/src/conf.c +++ b/src/conf.c @@ -743,15 +743,8 @@ static HANDLE_FUNC (handle_listen) static HANDLE_FUNC (handle_errorfile) { - /* - * Because an integer is defined as ((0x)?[[:digit:]]+) _two_ - * match places are used. match[2] matches the full digit - * string, while match[3] matches only the "0x" part if - * present. This is why the "string" is located at - * match[4] (rather than the more intuitive match[3]. 
- */ unsigned long int err = get_long_arg (line, &match[MGROUP1+2]); - char *page = get_string_arg (line, &match[MGROUP1+4]); + char *page = get_string_arg (line, &match[MGROUP1+3]); if(add_new_errorpage (conf, page, err) < 0) { CP_WARN ("add_new_errorpage() failed: '%s'", page); @@ -974,7 +967,7 @@ static HANDLE_FUNC (handle_upstream) mi += 5; port = (int) get_long_arg (line, &match[mi]); - mi += 3; + mi += 2; if (match[mi].rm_so != -1) domain = get_string_arg (line, &match[mi]); diff --git a/src/conf_regex.h b/src/conf_regex.h index d87073a3..2350aa29 100644 --- a/src/conf_regex.h +++ b/src/conf_regex.h @@ -12,7 +12,7 @@ #define WS SPACE "+" #define STR "\"([^\"]+)\"" #define BOOL "(yes|on|no|off)" -#define INT "(()" DIGIT "+)" +#define INT "(" DIGIT "+)" #define ALNUM "([-a-z0-9._]+)" #define USERNAME "([^:]*)" #define PASSWORD "([^@]*)" From 173c5b66a7b8bdfad0d4ae25c138ccf0f3605eda Mon Sep 17 00:00:00 2001 From: rofl0r Date: Fri, 9 Oct 2020 01:04:44 +0100 Subject: [PATCH 05/13] conf: remove obsolete whitespace from regex start we already deal with leading whitespace before a command in a manual way before comparing keywords. --- src/conf_regex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/conf_regex.h b/src/conf_regex.h index 2350aa29..240d6641 100644 --- a/src/conf_regex.h +++ b/src/conf_regex.h @@ -24,7 +24,7 @@ ")" #define IPV6MASK "(" IPV6 "(/" DIGIT "+)?)" -#define BEGIN "^" SPACE "*" +#define BEGIN "^" #define END SPACE "*$" From 57f932a33b9be4558f7b16c5643526fee9b5cced Mon Sep 17 00:00:00 2001 From: rofl0r Date: Fri, 9 Oct 2020 01:26:50 +0100 Subject: [PATCH 06/13] conf: skip leading whitespace instead of adding it to each regex --- src/conf.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/conf.c b/src/conf.c index 2ce1b2c7..f6af3844 100644 --- a/src/conf.c +++ b/src/conf.c @@ -142,7 +142,7 @@ static void config_free_regex (void); * do not follow the pattern above. 
This macro is for convenience * only. */ -#define STDCONF(d, re, func) [CD_ ## d] = { BEGIN WS re END, func, NULL } +#define STDCONF(d, re, func) [CD_ ## d] = { BEGIN re END, func, NULL } /* * Holds the regular expression used to match the configuration directive, @@ -318,7 +318,7 @@ static int check_match (struct config_s *conf, const char *line, */ static int config_parse (struct config_s *conf, FILE * f) { - char buffer[LINE_MAX], *p, *q, c; + char buffer[LINE_MAX], *p, *q; const struct config_directive_entry *e; unsigned long lineno = 1; @@ -329,10 +329,10 @@ static int config_parse (struct config_s *conf, FILE * f) if(!*p) continue; q = p; while(!isspace(*q))q++; - c = *q; *q = 0; e = config_directive_find(p, strlen(p)); - *q = c; + ++q; + while(isspace(*q))++q; if (!e || e->value == CD_NIL || check_match (conf, q, lineno, e->value)) { fprintf (stderr, "ERROR: Syntax error on line %lu\n", lineno); return 1; From 86379b4b66d5c7c491b8b5ff3131030dc614a671 Mon Sep 17 00:00:00 2001 From: rofl0r Date: Fri, 9 Oct 2020 01:43:46 +0100 Subject: [PATCH 07/13] conf: parse regexes case-sensitive rather than treating everything as case insensitive, we explicitly allow upper/lowercase where it makes sense. 
--- src/conf.c | 2 +- src/conf_regex.h | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/conf.c b/src/conf.c index f6af3844..d40d3a34 100644 --- a/src/conf.c +++ b/src/conf.c @@ -262,7 +262,7 @@ config_init (void) r = regcomp (directives[i].cre, directives[i].re, - REG_EXTENDED | REG_ICASE | REG_NEWLINE); + REG_EXTENDED | REG_NEWLINE); if (r) return r; } diff --git a/src/conf_regex.h b/src/conf_regex.h index 240d6641..281e67b4 100644 --- a/src/conf_regex.h +++ b/src/conf_regex.h @@ -11,16 +11,16 @@ #define SPACE "[ \\t]" #define WS SPACE "+" #define STR "\"([^\"]+)\"" -#define BOOL "(yes|on|no|off)" +#define BOOL "([Yy][Ee][Ss]|[Oo][Nn]|[Nn][Oo]|[Oo][Ff][Ff])" #define INT "(" DIGIT "+)" -#define ALNUM "([-a-z0-9._]+)" +#define ALNUM "([-A-Za-z0-9._]+)" #define USERNAME "([^:]*)" #define PASSWORD "([^@]*)" #define IP "((([0-9]{1,3})\\.){3}[0-9]{1,3})" #define IPMASK "(" IP "(/" DIGIT "+)?)" #define IPV6 "(" \ - "(([0-9a-f:]{2,39}))|" \ - "(([0-9a-f:]{0,29}:" IP "))" \ + "(([0-9a-fA-F:]{2,39}))|" \ + "(([0-9a-fA-F:]{0,29}:" IP "))" \ ")" #define IPV6MASK "(" IPV6 "(/" DIGIT "+)?)" @@ -88,6 +88,6 @@ STDCONF (upstream, ":" INT "(" WS STR ")?" ")", handle_upstream), #endif /* loglevel */ -STDCONF (loglevel, "(critical|error|warning|notice|connect|info)", +STDCONF (loglevel, "([Cc]ritical|[Ee]rror|[Ww]arning|[Nn]otice|[Cc]onnect|[Ii]nfo)", handle_loglevel) From 22f059dc5e350b6d18a95265b8d4776d8adb9dca Mon Sep 17 00:00:00 2001 From: rofl0r Date: Mon, 12 Oct 2020 20:04:45 +0100 Subject: [PATCH 08/13] conf: simplify ipv4 regex use one matching group rather than 3. 
--- src/conf.c | 2 +- src/conf_regex.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/conf.c b/src/conf.c index d40d3a34..0af3c6ac 100644 --- a/src/conf.c +++ b/src/conf.c @@ -964,7 +964,7 @@ static HANDLE_FUNC (handle_upstream) ip = get_string_arg (line, &match[mi]); if (!ip) return -1; - mi += 5; + mi += 3; port = (int) get_long_arg (line, &match[mi]); mi += 2; diff --git a/src/conf_regex.h b/src/conf_regex.h index 281e67b4..c2d78f95 100644 --- a/src/conf_regex.h +++ b/src/conf_regex.h @@ -16,7 +16,7 @@ #define ALNUM "([-A-Za-z0-9._]+)" #define USERNAME "([^:]*)" #define PASSWORD "([^@]*)" -#define IP "((([0-9]{1,3})\\.){3}[0-9]{1,3})" +#define IP "([0-9]+[.][0-9]+[.][0-9]+[.][0-9]+)" #define IPMASK "(" IP "(/" DIGIT "+)?)" #define IPV6 "(" \ "(([0-9a-fA-F:]{2,39}))|" \ From ae4cbcabd1e36ea6ca3dbb8b05b093504d46060d Mon Sep 17 00:00:00 2001 From: rofl0r Date: Thu, 15 Oct 2020 22:36:10 +0100 Subject: [PATCH 09/13] conf: remove trailing whitespace via C code, not regex --- src/conf.c | 3 +++ src/conf_regex.h | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/conf.c b/src/conf.c index 0af3c6ac..0b3a1a1a 100644 --- a/src/conf.c +++ b/src/conf.c @@ -333,6 +333,9 @@ static int config_parse (struct config_s *conf, FILE * f) e = config_directive_find(p, strlen(p)); ++q; while(isspace(*q))++q; + p = q; + while(*p && *p != '\n') ++p; + while(isspace(*p)) *(p--) = 0; if (!e || e->value == CD_NIL || check_match (conf, q, lineno, e->value)) { fprintf (stderr, "ERROR: Syntax error on line %lu\n", lineno); return 1; diff --git a/src/conf_regex.h b/src/conf_regex.h index c2d78f95..2f3f3725 100644 --- a/src/conf_regex.h +++ b/src/conf_regex.h @@ -25,7 +25,7 @@ #define IPV6MASK "(" IPV6 "(/" DIGIT "+)?)" #define BEGIN "^" -#define END SPACE "*$" +#define END "$" STDCONF (logfile, STR, handle_logfile), From dabfd1ad6c54077051f2109a77a16efa654b806e Mon Sep 17 00:00:00 2001 From: rofl0r Date: Thu, 15 Oct 2020 22:39:46 +0100 Subject: 
[PATCH 10/13] conf: remove pointless assert() statement --- src/conf.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/conf.c b/src/conf.c index 0b3a1a1a..ebb1bfd0 100644 --- a/src/conf.c +++ b/src/conf.c @@ -249,7 +249,6 @@ config_init (void) unsigned int i, r; for (i = 0; i != ndirectives; ++i) { - assert (!directives[i].cre); if (!directives[i].handler) { directives[i].handler = handle_disabled_feature; From 42bb446c96cff5ba37fefe2c88d3055c2430f396 Mon Sep 17 00:00:00 2001 From: rofl0r Date: Fri, 16 Oct 2020 11:58:48 +0100 Subject: [PATCH 11/13] conf: shrink back RE_MAX_MATCHES to 16 with the IPv4 regex simplification from 22f059dc5e350b6d18a95265b8d4776d8adb9dca we're back to max 15 match groups according to re2r analysis (the most elaborate regex is the upstream one). --- src/conf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/conf.c b/src/conf.c index ebb1bfd0..b94b71e2 100644 --- a/src/conf.c +++ b/src/conf.c @@ -45,7 +45,7 @@ * number. Given the usual structure of the configuration file, sixteen * substring matches should be plenty. */ -#define RE_MAX_MATCHES 24 +#define RE_MAX_MATCHES 16 #define CP_WARN(FMT, ...) \ log_message (LOG_WARNING, "line %lu: " FMT, lineno, __VA_ARGS__) From 3a920b7163ecc5a2b5115724f9020f798985cbe5 Mon Sep 17 00:00:00 2001 From: rofl0r Date: Fri, 16 Oct 2020 12:03:28 +0100 Subject: [PATCH 12/13] conf: add tool to print regex name/regex pairs as re2r input this is currently not included in the build system and needs to be compiled by hand. --- src/conf_regex_print.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 src/conf_regex_print.c diff --git a/src/conf_regex_print.c b/src/conf_regex_print.c new file mode 100644 index 00000000..31463219 --- /dev/null +++ b/src/conf_regex_print.c @@ -0,0 +1,14 @@ +/* this is a tool to print regexname regex pairs as input for re2r. + compile with gcc -I. 
src/conf_regex_print.c + */ + +#include "config.h" + +#include <stdio.h> + +#define STDCONF(A, B, C) printf("%s %s\n", #A, B) + +int main() { +#include "conf_regex.h" +; +} From 417c258d145928a919e006dd90ecca58ddeed6c7 Mon Sep 17 00:00:00 2001 From: rofl0r Date: Fri, 16 Oct 2020 12:40:56 +0100 Subject: [PATCH 13/13] conf: speed up parsing 10x by using ragel if available conf_regex.rl is generated from the output of conf_regex_print.c using re2r (https://github.com/rofl0r/re2r). if ragel is available on the build host, it is being used to generate finite state machines from the regexes used by the config file parser for an impressive speed boost, while only adding moderately to binary size. a stripped x86_64 tinyproxy binary compiled with -O2 is still only ~100KB. --- configure.ac | 10 + src/Makefile.am | 13 +- src/conf.c | 32 ++- src/conf_regex.rl | 512 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 562 insertions(+), 5 deletions(-) create mode 100644 src/conf_regex.rl diff --git a/configure.ac b/configure.ac index 00f7f0e6..dd912792 100644 --- a/configure.ac +++ b/configure.ac @@ -213,6 +213,16 @@ if test "x$GPERF" != "x" -a "x$GPERF" != "xno" ; then AC_DEFINE(HAVE_GPERF) fi +AC_PATH_PROG(RAGEL, ragel, no) +AM_CONDITIONAL(HAVE_RAGEL, test "x$RAGEL" != "x" -a "x$RAGEL" != "xno") +AH_TEMPLATE([HAVE_RAGEL], + [Whether you have ragel installed for faster config parsing.]) + +if test "x$RAGEL" != "x" -a "x$RAGEL" != "xno" ; then + AC_DEFINE(HAVE_RAGEL) +fi + + AC_CONFIG_FILES([ Makefile src/Makefile diff --git a/src/Makefile.am b/src/Makefile.am index 6d806e03..9fc3b052 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -60,11 +60,22 @@ EXTRA_tinyproxy_SOURCES = filter.c filter.h \ tinyproxy_DEPENDENCIES = @ADDITIONAL_OBJECTS@ tinyproxy_LDADD = @ADDITIONAL_OBJECTS@ -lpthread +CLEANFILES = + if HAVE_GPERF conf-tokens.c: conf-tokens-gperf.inc conf-tokens-gperf.inc: conf-tokens.gperf $(GPERF) $< > $@ endif -EXTRA_DIST = conf-tokens.gperf +if HAVE_RAGEL +conf.c: 
conf_regex.inc +conf_regex.inc: conf_regex.rl + $(RAGEL) $(RAGEL_FLAGS) -o $@ $< + +CLEANFILES += conf_regex.inc +endif + + +EXTRA_DIST = conf-tokens.gperf conf_regex.rl diff --git a/src/conf.c b/src/conf.c index b94b71e2..58c12160 100644 --- a/src/conf.c +++ b/src/conf.c @@ -142,7 +142,14 @@ static void config_free_regex (void); * do not follow the pattern above. This macro is for convenience * only. */ -#define STDCONF(d, re, func) [CD_ ## d] = { BEGIN re END, func, NULL } +#ifdef HAVE_RAGEL +#define RE2R_EXPORT static +#include "conf_regex.inc" +typedef int (*matchfunc)(const char*, const char*, size_t, regmatch_t[]); +#define STDCONF(d, re, func) [CD_ ## d] = { func, re2r_match_ ## d } +#else +#define STDCONF(d, re, func) [CD_ ## d] = { func, BEGIN re END, NULL } +#endif /* * Holds the regular expression used to match the configuration directive, @@ -151,9 +158,13 @@ static void config_free_regex (void); * to be compiled one. */ struct { - const char *re; CONFFILE_HANDLER handler; +#ifndef HAVE_RAGEL + const char *re; regex_t *cre; +#else + matchfunc mf; +#endif } directives[] = { #include "conf_regex.h" }; @@ -248,6 +259,8 @@ config_init (void) { unsigned int i, r; + (void) r; + for (i = 0; i != ndirectives; ++i) { if (!directives[i].handler) { @@ -255,6 +268,7 @@ config_init (void) continue; } +#ifndef HAVE_RAGEL directives[i].cre = (regex_t *) safemalloc (sizeof (regex_t)); if (!directives[i].cre) return -1; @@ -264,6 +278,7 @@ config_init (void) REG_EXTENDED | REG_NEWLINE); if (r) return r; +#endif } atexit (config_free_regex); @@ -278,6 +293,7 @@ config_init (void) static void config_free_regex (void) { +#ifndef HAVE_RAGEL unsigned int i; for (i = 0; i < ndirectives; i++) { @@ -287,6 +303,7 @@ config_free_regex (void) directives[i].cre = NULL; } } +#endif } /* @@ -297,18 +314,25 @@ config_free_regex (void) * Returns 0 if a match was found and successfully processed; otherwise, * a negative number is returned. 
*/ -static int check_match (struct config_s *conf, const char *line, +static int check_match (struct config_s *conf, + const char *line, const char* lineend, unsigned long lineno, enum config_directive cd) { regmatch_t match[RE_MAX_MATCHES]; unsigned int i = cd; +#ifndef HAVE_RAGEL + (void) lineend; if (!directives[i].cre) return (*directives[i].handler) (conf, line, lineno, match); if (!regexec (directives[i].cre, line, RE_MAX_MATCHES, match, 0)) return (*directives[i].handler) (conf, line, lineno, match); +#else + if (!directives[i].mf(line, lineend, RE_MAX_MATCHES, match)) + return (*directives[i].handler) (conf, line, lineno, match); +#endif return -1; } @@ -335,7 +359,7 @@ static int config_parse (struct config_s *conf, FILE * f) p = q; while(*p && *p != '\n') ++p; while(isspace(*p)) *(p--) = 0; - if (!e || e->value == CD_NIL || check_match (conf, q, lineno, e->value)) { + if (!e || e->value == CD_NIL || check_match (conf, q, ++p, lineno, e->value)) { fprintf (stderr, "ERROR: Syntax error on line %lu\n", lineno); return 1; } diff --git a/src/conf_regex.rl b/src/conf_regex.rl new file mode 100644 index 00000000..aa5c6002 --- /dev/null +++ b/src/conf_regex.rl @@ -0,0 +1,512 @@ +/* automatically generated with re2r by rofl0r */ +%%{ +machine logfile; +action A1 { matches[1].rm_so = p-start; } +action E1 { matches[1].rm_eo = p-start; } +main := '"'([^"]+) >A1 %E1 '"' ; +}%% + +RE2R_EXPORT int re2r_match_logfile(const char *p, const char* pe, size_t nmatch, regmatch_t matches[]) +{ + size_t i, cs; + int par; + static const unsigned char parents[] = {[0]=0,[1]=0,}; + const char *start = p, *eof = pe; + %% write data nofinal noerror noentry; + for(i=0;iA1 %E1 ; +}%% + +RE2R_EXPORT int re2r_match_xtinyproxy(const char *p, const char* pe, size_t nmatch, regmatch_t matches[]) +{ + size_t i, cs; + int par; + static const unsigned char parents[] = {[0]=0,[1]=0,}; + const char *start = p, *eof = pe; + %% write data nofinal noerror noentry; + for(i=0;iA1 %E1 ; +}%% + 
+RE2R_EXPORT int re2r_match_port(const char *p, const char* pe, size_t nmatch, regmatch_t matches[]) +{ + size_t i, cs; + int par; + static const unsigned char parents[] = {[0]=0,[1]=0,}; + const char *start = p, *eof = pe; + %% write data nofinal noerror noentry; + for(i=0;iA1 %E1 ; +}%% + +RE2R_EXPORT int re2r_match_user(const char *p, const char* pe, size_t nmatch, regmatch_t matches[]) +{ + size_t i, cs; + int par; + static const unsigned char parents[] = {[0]=0,[1]=0,}; + const char *start = p, *eof = pe; + %% write data nofinal noerror noentry; + for(i=0;iA2 %E2 |((([0-9a-fA-F:]{2,39}) >A5 %E5 ) >A4 %E4 |(([0-9a-fA-F:]{0,29} ":" ([0-9]+[.][0-9]+[.][0-9]+[.][0-9]+) >A8 %E8 ) >A7 %E7 ) >A6 %E6 ) >A3 %E3 ) >A1 %E1 ; +}%% + +RE2R_EXPORT int re2r_match_listen(const char *p, const char* pe, size_t nmatch, regmatch_t matches[]) +{ + size_t i, cs; + int par; + static const unsigned char parents[] = {[0]=0,[1]=0,[2]=1,[3]=1,[4]=3,[5]=4,[6]=3,[7]=6,[8]=7,}; + const char *start = p, *eof = pe; + %% write data nofinal noerror noentry; + for(i=0;iA4 %E4 ( "/" [0-9]+)? >A5 %E5 ) >A3 %E3 |(((([0-9a-fA-F:]{2,39}) >A9 %E9 ) >A8 %E8 |(([0-9a-fA-F:]{0,29} ":" ([0-9]+[.][0-9]+[.][0-9]+[.][0-9]+) >A12 %E12 ) >A11 %E11 ) >A10 %E10 ) >A7 %E7 ( "/" [0-9]+)? 
>A13 %E13 ) >A6 %E6 ) >A2 %E2 |(('-'|[A-Za-z0-9._])+) >A14 %E14 ) >A1 %E1 ; +}%% + +RE2R_EXPORT int re2r_match_allow(const char *p, const char* pe, size_t nmatch, regmatch_t matches[]) +{ + size_t i, cs; + int par; + static const unsigned char parents[] = {[0]=0,[1]=0,[2]=1,[3]=2,[4]=3,[5]=3,[6]=2,[7]=6,[8]=7,[9]=8,[10]=7,[11]=10,[12]=11,[13]=6,[14]=1,}; + const char *start = p, *eof = pe; + %% write data nofinal noerror noentry; + for(i=0;iA1 %E1 [ \t]+(('-'|[A-Za-z0-9._])+) >A2 %E2 ; +}%% + +RE2R_EXPORT int re2r_match_basicauth(const char *p, const char* pe, size_t nmatch, regmatch_t matches[]) +{ + size_t i, cs; + int par; + static const unsigned char parents[] = {[0]=0,[1]=0,[2]=0,}; + const char *start = p, *eof = pe; + %% write data nofinal noerror noentry; + for(i=0;iA1 %E1 [ \t]+'"'([^"]+) >A2 %E2 '"' ; +}%% + +RE2R_EXPORT int re2r_match_errorfile(const char *p, const char* pe, size_t nmatch, regmatch_t matches[]) +{ + size_t i, cs; + int par; + static const unsigned char parents[] = {[0]=0,[1]=0,[2]=0,}; + const char *start = p, *eof = pe; + %% write data nofinal noerror noentry; + for(i=0;iA1 %E1 '"'[ \t]+'"'([^"]+) >A2 %E2 '"' ; +}%% + +RE2R_EXPORT int re2r_match_addheader(const char *p, const char* pe, size_t nmatch, regmatch_t matches[]) +{ + size_t i, cs; + int par; + static const unsigned char parents[] = {[0]=0,[1]=0,[2]=0,}; + const char *start = p, *eof = pe; + %% write data nofinal noerror noentry; + for(i=0;iA1 %E1 '"'([ \t]+'"'([^"]+) >A3 %E3 '"')? >A2 %E2 ; +}%% + +RE2R_EXPORT int re2r_match_reversepath(const char *p, const char* pe, size_t nmatch, regmatch_t matches[]) +{ + size_t i, cs; + int par; + static const unsigned char parents[] = {[0]=0,[1]=0,[2]=0,[3]=2,}; + const char *start = p, *eof = pe; + %% write data nofinal noerror noentry; + for(i=0;iA2 %E2 [ \t]+'"'([^"]+) >A3 %E3 '"') >A1 %E1 |(( "http" | "socks4" | "socks5" ) >A5 %E5 [ \t]+(([^:]*) >A7 %E7 ":" ([^@]*) >A8 %E8 "@" )? 
>A6 %E6 (([0-9]+[.][0-9]+[.][0-9]+[.][0-9]+) >A10 %E10 |(('-'|[A-Za-z0-9._])+) >A11 %E11 ) >A9 %E9 ":" ([0-9]+) >A12 %E12 ([ \t]+'"'([^"]+) >A14 %E14 '"')? >A13 %E13 ) >A4 %E4 ; +}%% + +RE2R_EXPORT int re2r_match_upstream(const char *p, const char* pe, size_t nmatch, regmatch_t matches[]) +{ + size_t i, cs; + int par; + static const unsigned char parents[] = {[0]=0,[1]=0,[2]=1,[3]=1,[4]=0,[5]=4,[6]=4,[7]=6,[8]=6,[9]=4,[10]=9,[11]=9,[12]=4,[13]=4,[14]=13,}; + const char *start = p, *eof = pe; + %% write data nofinal noerror noentry; + for(i=0;iA1 %E1 ; +}%% + +RE2R_EXPORT int re2r_match_loglevel(const char *p, const char* pe, size_t nmatch, regmatch_t matches[]) +{ + size_t i, cs; + int par; + static const unsigned char parents[] = {[0]=0,[1]=0,}; + const char *start = p, *eof = pe; + %% write data nofinal noerror noentry; + for(i=0;i