sys/shell: refactor tokenizer code
The tokenizer (the code that breaks up the line given to the shell into strings to create argv) was quite a messy piece of code. This commit refactors it into a more traditional state-machine based parser. This fixes the issues with quote handling exposed by the recently introduced test. Co-authored-by: Juan Carrano <j.carrano@fu-berlin.de>
This commit is contained in:
parent
dd2ad603e8
commit
86f60357cf
@ -66,6 +66,27 @@
|
|||||||
#define PROMPT_ON 0
|
#define PROMPT_ON 0
|
||||||
#endif /* SHELL_NO_PROMPT */
|
#endif /* SHELL_NO_PROMPT */
|
||||||
|
|
||||||
|
/* Single quote: opens and closes a single-quoted word in the tokenizer. */
#define SQUOTE '\''
|
||||||
|
/* Double quote: opens and closes a double-quoted word in the tokenizer. */
#define DQUOTE '"'
|
||||||
|
/* Backslash: switches the tokenizer into the escaped variant of its state. */
#define ESCAPECHAR '\\'
|
||||||
|
/* Space: the character the tokenizer compares against to separate words. */
#define BLANK ' '
|
||||||
|
|
||||||
|
/**
 * States of the command line tokenizer.
 *
 * The numeric values are part of the design and are therefore spelled out:
 * escape_toggle() moves between a state and its escaped counterpart by
 * XOR-ing with PARSE_ESCAPE_MASK, so every *_ESC state must be equal to its
 * base state with the mask bit set. Do not reorder or insert enumerators
 * without keeping that relation intact.
 */
enum PARSE_STATE {
    PARSE_SPACE           = 0x0,    /* between words */
    PARSE_UNQUOTED        = 0x1,    /* inside a word without quotes */
    PARSE_SINGLEQUOTE     = 0x2,    /* inside a single-quoted word */
    PARSE_DOUBLEQUOTE     = 0x3,    /* inside a double-quoted word */
    PARSE_ESCAPE_MASK     = 0x4,    /* bit that marks the escaped states */
    PARSE_UNQUOTED_ESC    = PARSE_UNQUOTED    | PARSE_ESCAPE_MASK,
    PARSE_SINGLEQUOTE_ESC = PARSE_SINGLEQUOTE | PARSE_ESCAPE_MASK,
    PARSE_DOUBLEQUOTE_ESC = PARSE_DOUBLEQUOTE | PARSE_ESCAPE_MASK,
};
|
||||||
|
|
||||||
|
static enum PARSE_STATE escape_toggle(enum PARSE_STATE s)
|
||||||
|
{
|
||||||
|
return s ^ PARSE_ESCAPE_MASK;
|
||||||
|
}
|
||||||
|
|
||||||
static shell_command_handler_t find_handler(const shell_command_t *command_list, char *command)
|
static shell_command_handler_t find_handler(const shell_command_t *command_list, char *command)
|
||||||
{
|
{
|
||||||
const shell_command_t *command_lists[] = {
|
const shell_command_t *command_lists[] = {
|
||||||
@ -119,107 +140,128 @@ static void print_help(const shell_command_t *command_list)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Break input line into words, create argv and call the command handler.
|
||||||
|
*
|
||||||
|
* Words are broken up at spaces. A backslash escapes the character that comes
|
||||||
|
* after (meaning it is taken literally and, if it is a space, it does not break
|
||||||
|
* the word). Spaces can also be protected by quoting with double or single
|
||||||
|
* quotes.
|
||||||
|
*
|
||||||
|
State diagram for the tokenizer:
|
||||||
|
```
|
||||||
|
┌───[\]────┐ ┌─────["]────┐ ┌───[']─────┐ ┌───[\]────┐
|
||||||
|
↓ │ ↓ │ │ ↓ │ ↓
|
||||||
|
┏━━━━━━━━━━┓ ┏━┷━━━━━┓ ┏━┷━━━┷━┓ ┏━━━━┷━━┓ ┏━━━━━━━━━━┓
|
||||||
|
┃DQUOTE ESC┃ ┃DQUOTE ┠───["]─>┃SPACE ┃<─[']──┨SQUOTE ┃ ┃SQUOTE ESC┃
|
||||||
|
┗━━━━━━━━┯━┛ ┗━━━━━━┯┛ ┗┯━━━━┯━┛ ┗━┯━━━━━┛ ┗━━━┯━━━━━━┛
|
||||||
|
│ ↑ │ │ │ │ ↑(store) │
|
||||||
|
│ (store)│ │ ┌─[\]──┘ └──[*]────┐ │ │ │
|
||||||
|
└──[*]──▶┴◀[*]┘ │ │ └[*]▶┴◀──[*]──┘
|
||||||
|
↓ ┏━━━━━━━┓ ↓
|
||||||
|
├◀[\]┨NOQUOTE┃◀─────┼◀─┐
|
||||||
|
│ ┗━━━━━┯━┛(store)↑ │
|
||||||
|
│ │ │ │
|
||||||
|
│ └─[*]─────┘ │
|
||||||
|
│ ┏━━━━━━━━━━━┓ │
|
||||||
|
└───▶┃NOQUOTE ESC┠──[*]──┘
|
||||||
|
┗━━━━━━━━━━━┛
|
||||||
|
```
|
||||||
|
*/
|
||||||
static void handle_input_line(const shell_command_t *command_list, char *line)
|
static void handle_input_line(const shell_command_t *command_list, char *line)
|
||||||
{
|
{
|
||||||
static const char *INCORRECT_QUOTING = "shell: incorrect quoting";
|
static const char *INCORRECT_QUOTING = "shell: incorrect quoting";
|
||||||
|
|
||||||
/* first we need to calculate the number of arguments */
|
/* first we need to calculate the number of arguments */
|
||||||
unsigned argc = 0;
|
int argc = 0;
|
||||||
char *pos = line;
|
char *readpos = line;
|
||||||
int contains_esc_seq = 0;
|
char *writepos = readpos;
|
||||||
while (1) {
|
enum PARSE_STATE pstate = PARSE_SPACE;
|
||||||
if ((unsigned char) *pos > ' ') {
|
|
||||||
/* found an argument */
|
while (*readpos != '\0') {
|
||||||
if (*pos == '"' || *pos == '\'') {
|
switch (pstate) {
|
||||||
/* it's a quoted argument */
|
case PARSE_SPACE:
|
||||||
const char quote_char = *pos;
|
if (*readpos != BLANK) {
|
||||||
do {
|
argc++;
|
||||||
++pos;
|
|
||||||
if (!*pos) {
|
|
||||||
puts(INCORRECT_QUOTING);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
else if (*pos == '\\') {
|
|
||||||
/* skip over the next character */
|
|
||||||
++contains_esc_seq;
|
|
||||||
++pos;
|
|
||||||
if (!*pos) {
|
|
||||||
puts(INCORRECT_QUOTING);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
} while (*pos != quote_char);
|
|
||||||
if ((unsigned char) pos[1] > ' ') {
|
|
||||||
puts(INCORRECT_QUOTING);
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
}
|
if (*readpos == SQUOTE) {
|
||||||
else {
|
pstate = PARSE_SINGLEQUOTE;
|
||||||
/* it's an unquoted argument */
|
}
|
||||||
do {
|
else if (*readpos == DQUOTE) {
|
||||||
if (*pos == '\\') {
|
pstate = PARSE_DOUBLEQUOTE;
|
||||||
/* skip over the next character */
|
}
|
||||||
++contains_esc_seq;
|
else if (*readpos == ESCAPECHAR) {
|
||||||
++pos;
|
pstate = PARSE_UNQUOTED_ESC;
|
||||||
if (!*pos) {
|
}
|
||||||
puts(INCORRECT_QUOTING);
|
else if (*readpos != BLANK) {
|
||||||
return;
|
pstate = PARSE_UNQUOTED;
|
||||||
}
|
break;
|
||||||
}
|
}
|
||||||
++pos;
|
goto parse_end;
|
||||||
if (*pos == '"') {
|
|
||||||
puts(INCORRECT_QUOTING);
|
case PARSE_UNQUOTED:
|
||||||
return;
|
if (*readpos == BLANK) {
|
||||||
}
|
pstate = PARSE_SPACE;
|
||||||
} while ((unsigned char) *pos > ' ');
|
*writepos++ = '\0';
|
||||||
}
|
goto parse_end;
|
||||||
|
}
|
||||||
|
else if (*readpos == ESCAPECHAR) {
|
||||||
|
pstate = escape_toggle(pstate);
|
||||||
|
goto parse_end;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
/* count the number of arguments we got */
|
case PARSE_SINGLEQUOTE:
|
||||||
++argc;
|
if (*readpos == SQUOTE) {
|
||||||
}
|
pstate = PARSE_SPACE;
|
||||||
|
*writepos++ = '\0';
|
||||||
|
goto parse_end;
|
||||||
|
}
|
||||||
|
else if (*readpos == ESCAPECHAR) {
|
||||||
|
pstate = escape_toggle(pstate);
|
||||||
|
goto parse_end;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
/* zero out current position (space or quotation mark) and advance */
|
case PARSE_DOUBLEQUOTE:
|
||||||
if (*pos > 0) {
|
if (*readpos == DQUOTE) {
|
||||||
*pos = 0;
|
pstate = PARSE_SPACE;
|
||||||
++pos;
|
*writepos++ = '\0';
|
||||||
}
|
goto parse_end;
|
||||||
else {
|
}
|
||||||
break;
|
else if (*readpos == ESCAPECHAR) {
|
||||||
|
pstate = escape_toggle(pstate);
|
||||||
|
goto parse_end;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
default: /* QUOTED state */
|
||||||
|
pstate = escape_toggle(pstate);
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
*writepos++ = *readpos;
|
||||||
|
parse_end:
|
||||||
|
readpos++;
|
||||||
}
|
}
|
||||||
if (!argc) {
|
*writepos = '\0';
|
||||||
|
|
||||||
|
if (pstate != PARSE_SPACE && pstate != PARSE_UNQUOTED) {
|
||||||
|
puts(INCORRECT_QUOTING);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argc == 0) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* then we fill the argv array */
|
/* then we fill the argv array */
|
||||||
char *argv[argc + 1];
|
int collected;
|
||||||
argv[argc] = NULL;
|
char *argv[argc];
|
||||||
pos = line;
|
|
||||||
for (unsigned i = 0; i < argc; ++i) {
|
readpos = line;
|
||||||
while (!*pos) {
|
for (collected = 0; collected < argc; collected++) {
|
||||||
++pos;
|
argv[collected] = readpos;
|
||||||
}
|
readpos += strlen(readpos) + 1;
|
||||||
if (*pos == '"' || *pos == '\'') {
|
|
||||||
++pos;
|
|
||||||
}
|
|
||||||
argv[i] = pos;
|
|
||||||
while (*pos) {
|
|
||||||
++pos;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (char **arg = argv; contains_esc_seq && *arg; ++arg) {
|
|
||||||
for (char *c = *arg; *c; ++c) {
|
|
||||||
if (*c != '\\') {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
for (char *d = c; *d; ++d) {
|
|
||||||
*d = d[1];
|
|
||||||
}
|
|
||||||
if (--contains_esc_seq == 0) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* then we call the appropriate handler */
|
/* then we call the appropriate handler */
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user