sys/shell: refactor tokenizer code

The tokenizer (the code that breaks up the line given to the shell into
strings to create argv) was quite a messy piece of code. This commit
refactors it into a more traditional state-machine based parser.

This fixes the issues with quote handling exposed by the recently
introduced test.

Co-authored-by: Juan Carrano <j.carrano@fu-berlin.de>
This commit is contained in:
Hendrik van Essen 2020-02-08 15:00:27 +01:00
parent dd2ad603e8
commit 86f60357cf

View File

@ -66,6 +66,27 @@
#define PROMPT_ON 0 #define PROMPT_ON 0
#endif /* SHELL_NO_PROMPT */ #endif /* SHELL_NO_PROMPT */
/* Characters that the command line tokenizer treats specially. */
/* Single quote: opens and closes a single-quoted section. */
#define SQUOTE '\''
/* Double quote: opens and closes a double-quoted section. */
#define DQUOTE '"'
/* Backslash: the character that follows it is stored literally. */
#define ESCAPECHAR '\\'
/* Word separator outside of quotes (only the plain space character). */
#define BLANK ' '
/**
 * States of the command line tokenizer.
 *
 * Each escaped state has the same numeric value as its plain counterpart with
 * the PARSE_ESCAPE_MASK bit set, so XOR-ing a state with PARSE_ESCAPE_MASK
 * switches between "plain" and "escaped" and back again.
 */
enum PARSE_STATE {
    /* plain states: the low two bits select the quoting context */
    PARSE_SPACE           = 0x0,    /* between words */
    PARSE_UNQUOTED        = 0x1,    /* inside an unquoted word */
    PARSE_SINGLEQUOTE     = 0x2,    /* inside '...' */
    PARSE_DOUBLEQUOTE     = 0x3,    /* inside "..." */

    /* bit that marks "the previous character was the escape character" */
    PARSE_ESCAPE_MASK     = 0x4,

    /* escaped states: plain state with the escape bit set */
    PARSE_UNQUOTED_ESC    = PARSE_ESCAPE_MASK | PARSE_UNQUOTED,
    PARSE_SINGLEQUOTE_ESC = PARSE_ESCAPE_MASK | PARSE_SINGLEQUOTE,
    PARSE_DOUBLEQUOTE_ESC = PARSE_ESCAPE_MASK | PARSE_DOUBLEQUOTE,
};
static enum PARSE_STATE escape_toggle(enum PARSE_STATE s)
{
return s ^ PARSE_ESCAPE_MASK;
}
static shell_command_handler_t find_handler(const shell_command_t *command_list, char *command) static shell_command_handler_t find_handler(const shell_command_t *command_list, char *command)
{ {
const shell_command_t *command_lists[] = { const shell_command_t *command_lists[] = {
@ -119,107 +140,128 @@ static void print_help(const shell_command_t *command_list)
} }
} }
/**
* Break input line into words, create argv and call the command handler.
*
* Words are broken up at spaces. A backslash escapes the character that comes
* after it (meaning that character is taken literally and, if it is a space, it
* does not break the word). Spaces can also be protected by quoting with double
* or single quotes.
*
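* State diagram for the tokenizer, written as a transition table:
* ```
*   state         input   next state    action
*   -----------   -----   -----------   ------------------------
*   SPACE         [ ]     SPACE         (skip)
*   SPACE         [']     SQUOTE        (new word)
*   SPACE         ["]     DQUOTE        (new word)
*   SPACE         [\]     NOQUOTE ESC   (new word)
*   SPACE         [*]     NOQUOTE       (new word, store)
*
*   NOQUOTE       [ ]     SPACE         (terminate word)
*   NOQUOTE       [\]     NOQUOTE ESC
*   NOQUOTE       [*]     NOQUOTE       (store)
*   NOQUOTE ESC   [*]     NOQUOTE       (store)
*
*   SQUOTE        [']     SPACE         (terminate word)
*   SQUOTE        [\]     SQUOTE ESC
*   SQUOTE        [*]     SQUOTE        (store)
*   SQUOTE ESC    [*]     SQUOTE        (store)
*
*   DQUOTE        ["]     SPACE         (terminate word)
*   DQUOTE        [\]     DQUOTE ESC
*   DQUOTE        [*]     DQUOTE        (store)
*   DQUOTE ESC    [*]     DQUOTE        (store)
* ```
* [*] stands for any character not listed explicitly for that state; in the
* ESC states it is every character. Ending the line in any state other than
* SPACE or NOQUOTE is reported as incorrect quoting.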
*/
static void handle_input_line(const shell_command_t *command_list, char *line) static void handle_input_line(const shell_command_t *command_list, char *line)
{ {
static const char *INCORRECT_QUOTING = "shell: incorrect quoting"; static const char *INCORRECT_QUOTING = "shell: incorrect quoting";
/* first we need to calculate the number of arguments */ /* first we need to calculate the number of arguments */
unsigned argc = 0; int argc = 0;
char *pos = line; char *readpos = line;
int contains_esc_seq = 0; char *writepos = readpos;
while (1) { enum PARSE_STATE pstate = PARSE_SPACE;
if ((unsigned char) *pos > ' ') {
/* found an argument */ while (*readpos != '\0') {
if (*pos == '"' || *pos == '\'') { switch (pstate) {
/* it's a quoted argument */ case PARSE_SPACE:
const char quote_char = *pos; if (*readpos != BLANK) {
do { argc++;
++pos;
if (!*pos) {
puts(INCORRECT_QUOTING);
return;
}
else if (*pos == '\\') {
/* skip over the next character */
++contains_esc_seq;
++pos;
if (!*pos) {
puts(INCORRECT_QUOTING);
return;
}
continue;
}
} while (*pos != quote_char);
if ((unsigned char) pos[1] > ' ') {
puts(INCORRECT_QUOTING);
return;
} }
} if (*readpos == SQUOTE) {
else { pstate = PARSE_SINGLEQUOTE;
/* it's an unquoted argument */ }
do { else if (*readpos == DQUOTE) {
if (*pos == '\\') { pstate = PARSE_DOUBLEQUOTE;
/* skip over the next character */ }
++contains_esc_seq; else if (*readpos == ESCAPECHAR) {
++pos; pstate = PARSE_UNQUOTED_ESC;
if (!*pos) { }
puts(INCORRECT_QUOTING); else if (*readpos != BLANK) {
return; pstate = PARSE_UNQUOTED;
} break;
} }
++pos; goto parse_end;
if (*pos == '"') {
puts(INCORRECT_QUOTING); case PARSE_UNQUOTED:
return; if (*readpos == BLANK) {
} pstate = PARSE_SPACE;
} while ((unsigned char) *pos > ' '); *writepos++ = '\0';
} goto parse_end;
}
else if (*readpos == ESCAPECHAR) {
pstate = escape_toggle(pstate);
goto parse_end;
}
break;
/* count the number of arguments we got */ case PARSE_SINGLEQUOTE:
++argc; if (*readpos == SQUOTE) {
} pstate = PARSE_SPACE;
*writepos++ = '\0';
goto parse_end;
}
else if (*readpos == ESCAPECHAR) {
pstate = escape_toggle(pstate);
goto parse_end;
}
break;
/* zero out current position (space or quotation mark) and advance */ case PARSE_DOUBLEQUOTE:
if (*pos > 0) { if (*readpos == DQUOTE) {
*pos = 0; pstate = PARSE_SPACE;
++pos; *writepos++ = '\0';
} goto parse_end;
else { }
break; else if (*readpos == ESCAPECHAR) {
pstate = escape_toggle(pstate);
goto parse_end;
}
break;
default: /* QUOTED state */
pstate = escape_toggle(pstate);
break;
} }
*writepos++ = *readpos;
parse_end:
readpos++;
} }
if (!argc) { *writepos = '\0';
if (pstate != PARSE_SPACE && pstate != PARSE_UNQUOTED) {
puts(INCORRECT_QUOTING);
return;
}
if (argc == 0) {
return; return;
} }
/* then we fill the argv array */ /* then we fill the argv array */
char *argv[argc + 1]; int collected;
argv[argc] = NULL; char *argv[argc];
pos = line;
for (unsigned i = 0; i < argc; ++i) { readpos = line;
while (!*pos) { for (collected = 0; collected < argc; collected++) {
++pos; argv[collected] = readpos;
} readpos += strlen(readpos) + 1;
if (*pos == '"' || *pos == '\'') {
++pos;
}
argv[i] = pos;
while (*pos) {
++pos;
}
}
for (char **arg = argv; contains_esc_seq && *arg; ++arg) {
for (char *c = *arg; *c; ++c) {
if (*c != '\\') {
continue;
}
for (char *d = c; *d; ++d) {
*d = d[1];
}
if (--contains_esc_seq == 0) {
break;
}
}
} }
/* then we call the appropriate handler */ /* then we call the appropriate handler */