sys/shell: further refactor tokenizer (part 2/2)

Code now correctly handles quotes within PARSE_UNQUOTED
and tabs are now considered a BLANK just like a space.
This commit is contained in:
Hendrik van Essen 2020-06-17 14:48:14 +02:00
parent 0782b493ed
commit cc759ebcca

View File

@ -69,20 +69,24 @@
#define SQUOTE '\'' #define SQUOTE '\''
#define DQUOTE '"' #define DQUOTE '"'
#define ESCAPECHAR '\\' #define ESCAPECHAR '\\'
#define BLANK ' ' #define SPACE ' '
#define TAB '\t'
enum PARSE_STATE { #define PARSE_ESCAPE_MASK 0x4;
PARSE_SPACE,
PARSE_UNQUOTED, enum parse_state {
PARSE_SINGLEQUOTE, PARSE_BLANK = 0x0,
PARSE_DOUBLEQUOTE,
PARSE_ESCAPE_MASK, PARSE_UNQUOTED = 0x1,
PARSE_UNQUOTED_ESC, PARSE_SINGLEQUOTE = 0x2,
PARSE_SINGLEQUOTE_ESC, PARSE_DOUBLEQUOTE = 0x3,
PARSE_DOUBLEQUOTE_ESC,
PARSE_UNQUOTED_ESC = 0x5,
PARSE_SINGLEQUOTE_ESC = 0x6,
PARSE_DOUBLEQUOTE_ESC = 0x7,
}; };
static enum PARSE_STATE escape_toggle(enum PARSE_STATE s) static enum parse_state escape_toggle(enum parse_state s)
{ {
return s ^ PARSE_ESCAPE_MASK; return s ^ PARSE_ESCAPE_MASK;
} }
@ -136,30 +140,56 @@ static void print_help(const shell_command_t *command_list)
/** /**
* Break input line into words, create argv and call the command handler. * Break input line into words, create argv and call the command handler.
* *
* Words are broken up at spaces. A backslash escaped the character that comes * Words are broken up at spaces or tabs. A backslash escapes the character that comes
* after (meaning it is taken literally, and if it is a space it does not break * after (meaning it is taken literally, and if it is a space it does not break
* the word). Spaces can also be protected by quoting with double or single * the word). Spaces can also be protected by quoting with double or single
* quotes. * quotes.
* *
State diagram for the tokenizer: * There are two unquoted states (PARSE_BLANK and PARSE_UNQUOTED) and two quoted
``` * states (PARSE_SINGLEQUOTE and PARSE_DOUBLEQUOTE). In addition, every state
[\] ["]────┐ ┌───[']─────┐ ┌───[\]────┐ * (except PARSE_BLANK) has an escaped pair state (e.g. PARSE_SINGLEQUOTE and
* PARSE_SINGLEQUOTE_ESC).
*
DQUOTE ESC DQUOTE ["]─>┃SPACE ┃<─[']──┨SQUOTE ┃ ┃SQUOTE ESC┃ * For the following let's define some things
* - Function transit(character, state) to change to 'state' after
(store) * 'character' was read. The order of a list of transit-functions matters.
(store) [\] [*] * - A BLANK is either SPACE or TAB
[*][*] [*][*] * - '*' means any character
*
[\]NOQUOTE * PARSE_BLANK
(store) * transit(SQUOTE, PARSE_SINGLEQUOTE)
* transit(DQUOTE, PARSE_DOUBLEQUOTE)
[*] * transit(ESCAPECHAR, PARSE_UNQUOTED_ESC)
* transit(BLANK, PARSE_BLANK)
NOQUOTE ESC[*] * transit(*, PARSE_UNQUOTED) -> store character
*
``` * PARSE_UNQUOTED
* transit(SQUOTE, PARSE_SINGLEQUOTE)
* transit(DQUOTE, PARSE_DOUBLEQUOTE)
* transit(BLANK, PARSE_BLANK)
* transit(ESCAPECHAR, PARSE_UNQUOTED_ESC)
* transit(*, PARSE_UNQUOTED) -> store character
*
* PARSE_UNQUOTED_ESC
* transit(*, PARSE_UNQUOTED) -> store character
*
* PARSE_SINGLEQUOTE
* transit(SQUOTE, PARSE_UNQUOTED)
* transit(ESCAPECHAR, PARSE_SINGLEQUOTE_ESC)
* transit(*, PARSE_SINGLEQUOTE) -> store character
*
* PARSE_SINGLEQUOTE_ESC
* transit(*, PARSE_SINGLEQUOTE) -> store character
*
* PARSE_DOUBLEQUOTE
* transit(DQUOTE, PARSE_UNQUOTED)
* transit(ESCAPECHAR, PARSE_DOUBLEQUOTE_ESC)
* transit(*, PARSE_DOUBLEQUOTE) -> store character
*
* PARSE_DOUBLEQUOTE_ESC
* transit(*, PARSE_DOUBLEQUOTE) -> store character
*
*
*/ */
static void handle_input_line(const shell_command_t *command_list, char *line) static void handle_input_line(const shell_command_t *command_list, char *line)
{ {
@ -167,17 +197,18 @@ static void handle_input_line(const shell_command_t *command_list, char *line)
int argc = 0; int argc = 0;
char *readpos = line; char *readpos = line;
char *writepos = readpos; char *writepos = readpos;
enum PARSE_STATE pstate = PARSE_SPACE;
uint8_t pstate = PARSE_BLANK;
for (; *readpos != '\0'; readpos++) { for (; *readpos != '\0'; readpos++) {
char wordbreak = BLANK; char wordbreak = SPACE;
bool is_wordbreak = false; bool is_wordbreak = false;
switch (pstate) { switch (pstate) {
case PARSE_SPACE: case PARSE_BLANK:
if (*readpos != BLANK) { if (*readpos != SPACE && *readpos != TAB) {
argc++; argc++;
} }
@ -190,15 +221,29 @@ static void handle_input_line(const shell_command_t *command_list, char *line)
else if (*readpos == ESCAPECHAR) { else if (*readpos == ESCAPECHAR) {
pstate = PARSE_UNQUOTED_ESC; pstate = PARSE_UNQUOTED_ESC;
} }
else if (*readpos != BLANK) { else if (*readpos != SPACE && *readpos != TAB) {
pstate = PARSE_UNQUOTED; pstate = PARSE_UNQUOTED;
*writepos++ = *readpos; *writepos++ = *readpos;
} }
break; break;
case PARSE_UNQUOTED: case PARSE_UNQUOTED:
wordbreak = BLANK; if (*readpos == SQUOTE) {
is_wordbreak = true; pstate = PARSE_SINGLEQUOTE;
}
else if (*readpos == DQUOTE) {
pstate = PARSE_DOUBLEQUOTE;
}
else if (*readpos == ESCAPECHAR) {
pstate = escape_toggle(pstate);
}
else if (*readpos == SPACE || *readpos == TAB) {
pstate = PARSE_BLANK;
*writepos++ = '\0';
}
else {
*writepos++ = *readpos;
}
break; break;
case PARSE_SINGLEQUOTE: case PARSE_SINGLEQUOTE:
@ -219,8 +264,9 @@ static void handle_input_line(const shell_command_t *command_list, char *line)
if (is_wordbreak) { if (is_wordbreak) {
if (*readpos == wordbreak) { if (*readpos == wordbreak) {
pstate = PARSE_SPACE; if (wordbreak == SQUOTE || wordbreak == DQUOTE) {
*writepos++ = '\0'; pstate = PARSE_UNQUOTED;
}
} }
else if (*readpos == ESCAPECHAR) { else if (*readpos == ESCAPECHAR) {
pstate = escape_toggle(pstate); pstate = escape_toggle(pstate);
@ -232,7 +278,7 @@ static void handle_input_line(const shell_command_t *command_list, char *line)
} }
*writepos = '\0'; *writepos = '\0';
if (pstate != PARSE_SPACE && pstate != PARSE_UNQUOTED) { if (pstate != PARSE_BLANK && pstate != PARSE_UNQUOTED) {
puts("shell: incorrect quoting"); puts("shell: incorrect quoting");
return; return;
} }