/* source: nestlex.c */ /* Copyright Gerhard Rieger */ /* Published under the GNU General Public License V.2, see file COPYING */ /* a function for lexical scanning of nested character patterns */ #include #include "config.h" #include "mytypes.h" #include "sysincludes.h" static int _nestlex(const char **addr, char **token, ptrdiff_t *len, const char *ends[], const char *hquotes[], const char *squotes[], const char *nests[], bool dropquotes, bool c_esc, bool html_esc ); /* sub: scan a string and copy its value to output string end scanning when an unescaped, unnested string from ends array is found does not copy the end pattern does not write a trailing \0 to token allows escaping with \ and quoting (\ and quotes are removed) allows nesting with div. parens returns -1 if out string was too small returns 1 if addr ended unexpectedly returns 0 if token could be extracted successfully */ int nestlex(const char **addr, /* input string; aft points to end token */ char **token, /* output token; aft points to first unwritten char (caller might want to set it to \0) */ size_t *len, /* remaining bytes in token space (incl. \0) */ const char *ends[], /* list of end strings */ const char *hquotes[],/* list of strings that quote (hard qu.) */ const char *squotes[],/* list of strings that quote softly */ const char *nests[],/* list of strings that start nesting; every second one is matching end */ bool dropquotes, /* drop the outermost quotes */ bool c_esc, /* solve C char escapes: \n \t \0 etc */ bool html_esc /* solve HTML char escapes: %0d %08 etc */ ) { return _nestlex(addr, token, (ptrdiff_t *)len, ends, hquotes, squotes, nests, dropquotes, c_esc, html_esc); } static int _nestlex(const char **addr, char **token, ptrdiff_t *len, const char *ends[], const char *hquotes[], const char *squotes[], const char *nests[], bool dropquotes, bool c_esc, bool html_esc ) { const char *in = *addr; /* pointer into input string */ const char **endx; /* loops over end patterns */ const char **quotx; /* loops over quote patterns */ const char **nestx; /* loops over nest patterns */ char *out = *token; /* pointer into output token */ char c; int i; int result; while (true) { /* is this end of input string? */ if (*in == 0) { break; /* end of string */ } /* first check the end patterns (e.g. for ']') */ endx = ends; i = 0; while (*endx) { if (!strncmp(in, *endx, strlen(*endx))) { /* this end pattern matches */ *addr = in; *token = out; return 0; } ++endx; } /* check for hard quoting pattern */ quotx = hquotes; while (hquotes && *quotx) { if (!strncmp(in, *quotx, strlen(*quotx))) { /* this quote pattern matches */ const char *endnest[2]; if (dropquotes) { /* we strip this quote */ in += strlen(*quotx); } else { for (i = strlen(*quotx); i > 0; --i) { *out++ = *in++; if (--*len <= 0) { *addr = in; *token = out; return -1; } } } /* we call _nestlex recursively */ endnest[0] = *quotx; endnest[1] = NULL; result = _nestlex(&in, &out, len, endnest, NULL/*hquotes*/, NULL/*squotes*/, NULL/*nests*/, false, c_esc, html_esc); if (result == 0 && dropquotes) { /* we strip this quote */ in += strlen(*quotx); } else if (result < 0) { *addr = in; *token = out; return result; } else { /* we copy the trailing quote */ for (i = strlen(*quotx); i > 0; --i) { *out++ = *in++; if (--*len <= 0) { *addr = in; *token = out; return -1; } } } break; } ++quotx; } if (hquotes && *quotx != NULL) { /* there was a quote; string might continue with hard quote */ continue; } /* check for soft quoting pattern */ quotx = squotes; while (squotes && *quotx) { if (!strncmp(in, *quotx, strlen(*quotx))) { /* this quote pattern matches */ /* we strip this quote */ /* we call _nestlex recursively */ const char *endnest[2]; if (dropquotes) { /* we strip this quote */ in += strlen(*quotx); } else { for (i = strlen(*quotx); i > 0; --i) { *out++ = *in++; if (--*len <= 0) { *addr = in; *token = out; return -1; } } } endnest[0] = *quotx; endnest[1] = NULL; result = _nestlex(&in, &out, len, endnest, hquotes, squotes, nests, false, c_esc, html_esc); if (result == 0 && dropquotes) { /* we strip the trailing quote */ in += strlen(*quotx); } else if (result < 0) { *addr = in; *token = out; return result; } else { /* we copy the trailing quote */ for (i = strlen(*quotx); i > 0; --i) { *out++ = *in++; if (--*len <= 0) { *addr = in; *token = out; return -1; } } } break; } ++quotx; } if (squotes && *quotx != NULL) { /* there was a soft quote; string might continue with any quote */ continue; } /* check patterns that start a nested clause */ nestx = nests; i = 0; while (nests && *nestx) { if (!strncmp(in, *nestx, strlen(*nestx))) { /* this nest pattern matches */ const char *endnest[2]; endnest[0] = nestx[1]; endnest[1] = NULL; for (i = strlen(nestx[1]); i > 0; --i) { *out++ = *in++; if (--*len <= 0) { *addr = in; *token = out; return -1; } } result = _nestlex(&in, &out, len, endnest, hquotes, squotes, nests, false, c_esc, html_esc); if (result == 0) { /* copy endnest */ i = strlen(nestx[1]); while (i > 0) { *out++ = *in++; if (--*len <= 0) { *addr = in; *token = out; return -1; } --i; } } else if (result < 0) { *addr = in; *token = out; return result; } break; } nestx += 2; /* skip matching end pattern in table */ } if (nests && *nestx) { /* we handled a nested expression, continue loop */ continue; } /* "normal" data, possibly escaped */ c = *in++; if (c == '\\') { /* found a plain \ escaped part */ c = *in++; if (c == 0) { /* Warn("trailing '\\'");*/ break; } if (c_esc) { /* solve C char escapes: \n \t \0 etc */ switch (c) { case '0': c = '\0'; break; case 'a': c = '\a'; break; case 'b': c = '\b'; break; case 'f': c = '\f'; break; case 'n': c = '\n'; break; case 'r': c = '\r'; break; case 't': c = '\t'; break; case 'v': c = '\v'; break; #if LATER case 'x': !!! 1 to 2 hex digits; break; case 'u': !!! 4 hex digits?; break; case 'U': !!! 8 hex digits?; break; #endif default: break; } } *out++ = c; --*len; if (*len <= 0) { *addr = in; *token = out; return -1; /* output overflow */ } continue; } /* just a simple char */ *out++ = c; --*len; if (*len <= 0) { *addr = in; *token = out; return -1; /* output overflow */ } } /* never come here? */ *addr = in; *token = out; return 0; /* OK */ }