xaizek / zograscope (License: AGPLv3 only) (since 2018-12-07)
Mainly a syntax-aware diff that also provides a number of additional tools.
<root> / third-party / tree-sitter / parsers / bash-scanner.cpp (31231ae1f22b2fdd9ee452a5b5e02c260fec2f54) (10KiB) (mode 100644) [raw]
#include <tree_sitter/parser.h>
#include <string>
#include <cwctype>

namespace {

using std::string;

// Tokens this external scanner knows about.
//
// Every enumerator is used both as an index into the `valid_symbols` array
// handed to Scanner::scan() and as the value written to
// `lexer->result_symbol`, so the numeric values are spelled out explicitly.
// NOTE(review): presumably the order has to match the externals list of the
// bash grammar this scanner was generated for -- confirm before reordering.
enum TokenType {
  HEREDOC_START           = 0,   // delimiter word that follows a heredoc arrow
  SIMPLE_HEREDOC_BODY     = 1,   // heredoc body scanned from its start to its end in one token
  HEREDOC_BODY_BEGINNING  = 2,   // heredoc body from its start up to the first `$`
  HEREDOC_BODY_MIDDLE     = 3,   // heredoc body between two `$`
  HEREDOC_BODY_END        = 4,   // heredoc body after a `$` up to the end of the heredoc
  FILE_DESCRIPTOR         = 5,   // run of digits immediately followed by `<` or `>`
  EMPTY_VALUE             = 6,   // emitted when whitespace is next and this token is valid
  CONCAT                  = 7,   // emitted when the next character does not end a word
  VARIABLE_NAME           = 8,   // identifier followed by `=`, `+=` or `[`
  REGEX                   = 9,   // text up to an unbalanced `)`, `]` or `}`
  CLOSING_BRACE           = 10,  // only consulted through valid_symbols, never produced here
  CLOSING_BRACKET         = 11,  // only consulted through valid_symbols, never produced here
  HEREDOC_ARROW           = 12,  // `<<`
  HEREDOC_ARROW_DASH      = 13,  // `<<-`
  NEWLINE                 = 14,  // only consulted through valid_symbols, never produced here
};

struct Scanner {
  void skip(TSLexer *lexer) {
    lexer->advance(lexer, true);
  }

  void advance(TSLexer *lexer) {
    lexer->advance(lexer, false);
  }

  unsigned serialize(char *buffer) {
    if (heredoc_delimiter.length() + 3 >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) return 0;
    buffer[0] = heredoc_is_raw;
    buffer[1] = started_heredoc;
    buffer[2] = heredoc_allows_indent;
    heredoc_delimiter.copy(&buffer[3], heredoc_delimiter.length());
    return heredoc_delimiter.length() + 3;
  }

  void deserialize(const char *buffer, unsigned length) {
    if (length == 0) {
      heredoc_is_raw = false;
      started_heredoc = false;
      heredoc_allows_indent = false;
      heredoc_delimiter.clear();
    } else {
      heredoc_is_raw = buffer[0];
      started_heredoc = buffer[1];
      heredoc_allows_indent = buffer[2];
      heredoc_delimiter.assign(&buffer[3], &buffer[length]);
    }
  }

  bool scan_heredoc_start(TSLexer *lexer) {
    while (iswspace(lexer->lookahead)) skip(lexer);

    lexer->result_symbol = HEREDOC_START;
    heredoc_is_raw = lexer->lookahead == '\'';
    started_heredoc = false;
    heredoc_delimiter.clear();

    if (lexer->lookahead == '\\') {
      advance(lexer);
    }

    int32_t quote = 0;
    if (heredoc_is_raw || lexer->lookahead == '"') {
      quote = lexer->lookahead;
      advance(lexer);
    }

    while (iswalpha(lexer->lookahead) || (quote != 0 && iswspace(lexer->lookahead))) {
      heredoc_delimiter += lexer->lookahead;
      advance(lexer);
    }

    if (lexer->lookahead == quote) {
      advance(lexer);
    }

    return !heredoc_delimiter.empty();
  }

  bool scan_heredoc_end_identifier(TSLexer *lexer) {
    current_leading_word.clear();
    // Scan the first 'n' characters on this line, to see if they match the heredoc delimiter
    while (
      lexer->lookahead != '\0' &&
      lexer->lookahead != '\n' &&
      current_leading_word.length() < heredoc_delimiter.length()
    ) {
      current_leading_word += lexer->lookahead;
      advance(lexer);
    }
    return current_leading_word == heredoc_delimiter;
  }

  bool scan_heredoc_content(TSLexer *lexer, TokenType middle_type, TokenType end_type) {
    bool did_advance = false;

    for (;;) {
      switch (lexer->lookahead) {
        case '\0': {
          if (did_advance) {
            heredoc_is_raw = false;
            started_heredoc = false;
            heredoc_allows_indent = false;
            heredoc_delimiter.clear();
            lexer->result_symbol = end_type;
            return true;
          } else {
            return false;
          }
        }

        case '\\': {
          did_advance = true;
          advance(lexer);
          advance(lexer);
          break;
        }

        case '$': {
          if (heredoc_is_raw) {
            did_advance = true;
            advance(lexer);
            break;
          } else if (did_advance) {
            lexer->result_symbol = middle_type;
            started_heredoc = true;
            return true;
          } else {
            return false;
          }
        }

        case '\n': {
          did_advance = true;
          advance(lexer);
          if (heredoc_allows_indent) {
            while (iswspace(lexer->lookahead)) {
              advance(lexer);
            }
          }
          if (scan_heredoc_end_identifier(lexer)) {
            heredoc_is_raw = false;
            started_heredoc = false;
            heredoc_allows_indent = false;
            heredoc_delimiter.clear();
            lexer->result_symbol = end_type;
            return true;
          }
          break;
        }

        default: {
          did_advance = true;
          advance(lexer);
          break;
        }
      }
    }
  }

  bool scan(TSLexer *lexer, const bool *valid_symbols) {
    if (valid_symbols[CONCAT]) {
      if (!(
        lexer->lookahead == 0 ||
        iswspace(lexer->lookahead) ||
        lexer->lookahead == '\\' ||
        lexer->lookahead == '>' ||
        lexer->lookahead == '<' ||
        lexer->lookahead == ')' ||
        lexer->lookahead == '(' ||
        lexer->lookahead == ';' ||
        lexer->lookahead == '&' ||
        lexer->lookahead == '|' ||
        lexer->lookahead == '`' ||
        lexer->lookahead == '#' ||
        (lexer->lookahead == '}' && valid_symbols[CLOSING_BRACE]) ||
        (lexer->lookahead == ']' && valid_symbols[CLOSING_BRACKET])
      )) {
        lexer->result_symbol = CONCAT;
        return true;
      }
    }

    if (valid_symbols[EMPTY_VALUE]) {
      if (iswspace(lexer->lookahead)) {
        lexer->result_symbol = EMPTY_VALUE;
        return true;
      }
    }

    if (valid_symbols[HEREDOC_BODY_BEGINNING] && !heredoc_delimiter.empty() && !started_heredoc) {
      return scan_heredoc_content(lexer, HEREDOC_BODY_BEGINNING, SIMPLE_HEREDOC_BODY);
    }

    if (valid_symbols[HEREDOC_BODY_MIDDLE] && !heredoc_delimiter.empty() && started_heredoc) {
      return scan_heredoc_content(lexer, HEREDOC_BODY_MIDDLE, HEREDOC_BODY_END);
    }

    if (valid_symbols[HEREDOC_START]) {
      return scan_heredoc_start(lexer);
    }

    if (valid_symbols[VARIABLE_NAME] || valid_symbols[FILE_DESCRIPTOR] || valid_symbols[HEREDOC_ARROW]) {
      for (;;) {
        if (
          lexer->lookahead == ' ' ||
          lexer->lookahead == '\t' ||
          lexer->lookahead == '\r' ||
          (lexer->lookahead == '\n' && !valid_symbols[NEWLINE])
        ) {
          skip(lexer);
        } else if (lexer->lookahead == '\\') {
          skip(lexer);
          if (lexer->lookahead == '\r') {
            skip(lexer);
          }
          if (lexer->lookahead == '\n') {
            skip(lexer);
          } else {
            return false;
          }
        } else {
          break;
        }
      }

      if (valid_symbols[HEREDOC_ARROW] && lexer->lookahead == '<') {
        advance(lexer);
        if (lexer->lookahead == '<') {
          advance(lexer);
          if (lexer->lookahead == '-') {
            advance(lexer);
            heredoc_allows_indent = true;
            lexer->result_symbol = HEREDOC_ARROW_DASH;
          } else if (lexer->lookahead == '<') {
            return false;
          } else {
            heredoc_allows_indent = false;
            lexer->result_symbol = HEREDOC_ARROW;
          }
          return true;
        }
        return false;
      }

      bool is_number = true;
      if (iswdigit(lexer->lookahead)) {
        advance(lexer);
      } else if (iswalpha(lexer->lookahead) || lexer->lookahead == '_') {
        is_number = false;
        advance(lexer);
      } else {
        return false;
      }

      for (;;) {
        if (iswdigit(lexer->lookahead)) {
          advance(lexer);
        } else if (iswalpha(lexer->lookahead) || lexer->lookahead == '_') {
          is_number = false;
          advance(lexer);
        } else {
          break;
        }
      }

      if (is_number &&
          valid_symbols[FILE_DESCRIPTOR] &&
          (lexer->lookahead == '>' || lexer->lookahead == '<')) {
        lexer->result_symbol = FILE_DESCRIPTOR;
        return true;
      }

      if (valid_symbols[VARIABLE_NAME]) {
        if (lexer->lookahead == '+') {
          lexer->mark_end(lexer);
          advance(lexer);
          if (lexer->lookahead == '=') {
            lexer->result_symbol = VARIABLE_NAME;
            return true;
          } else {
            return false;
          }
        } else if (lexer->lookahead == '=' || lexer->lookahead == '[') {
          lexer->result_symbol = VARIABLE_NAME;
          return true;
        }
      }

      return false;
    }

    if (valid_symbols[REGEX]) {
      while (iswspace(lexer->lookahead)) skip(lexer);

      if (
        lexer->lookahead != '"' &&
        lexer->lookahead != '\'' &&
        lexer->lookahead != '$'
      ) {
        struct State {
          bool done;
          uint32_t paren_depth;
          uint32_t bracket_depth;
          uint32_t brace_depth;
        };

        lexer->mark_end(lexer);

        State state = {false, 0, 0, 0};
        while (!state.done) {
          switch (lexer->lookahead) {
            case '\0':
              return false;
            case '(':
              state.paren_depth++;
              break;
            case '[':
              state.bracket_depth++;
              break;
            case '{':
              state.brace_depth++;
              break;
            case ')':
              if (state.paren_depth == 0) state.done = true;
              state.paren_depth--;
              break;
            case ']':
              if (state.bracket_depth == 0) state.done = true;
              state.bracket_depth--;
              break;
            case '}':
              if (state.brace_depth == 0) state.done = true;
              state.brace_depth--;
              break;
          }

          if (!state.done) {
            bool was_space = iswspace(lexer->lookahead);
            advance(lexer);
            if (!was_space) lexer->mark_end(lexer);
          }
        }

        lexer->result_symbol = REGEX;
        return true;
      }
    }

    return false;
  }

  string heredoc_delimiter;
  bool heredoc_is_raw;
  bool started_heredoc;
  bool heredoc_allows_indent;
  string current_leading_word;
};

}

extern "C" {

void *tree_sitter_bash_external_scanner_create() {
  return new Scanner();
}

bool tree_sitter_bash_external_scanner_scan(void *payload, TSLexer *lexer,
                                            const bool *valid_symbols) {
  Scanner *scanner = static_cast<Scanner *>(payload);
  return scanner->scan(lexer, valid_symbols);
}

unsigned tree_sitter_bash_external_scanner_serialize(void *payload, char *state) {
  Scanner *scanner = static_cast<Scanner *>(payload);
  return scanner->serialize(state);
}

void tree_sitter_bash_external_scanner_deserialize(void *payload, const char *state, unsigned length) {
  Scanner *scanner = static_cast<Scanner *>(payload);
  scanner->deserialize(state, length);
}

// Destroys the Scanner that tree_sitter_bash_external_scanner_create() returned.
void tree_sitter_bash_external_scanner_destroy(void *payload) {
  delete static_cast<Scanner *>(payload);
}

}
Hints

Before your first commit, do not forget to set up your git environment:
git config --global user.name "your_name_here"
git config --global user.email "your@email_here"

Clone this repository using HTTP(S):
git clone https://code.reversed.top/user/xaizek/zograscope

Clone this repository using ssh (do not forget to upload a key first):
git clone ssh://rocketgit@code.reversed.top/user/xaizek/zograscope

You are allowed to anonymously push to this repository.
This means that your pushed commits will automatically be transformed into a pull request:
... clone the repository ...
... make some changes and some commits ...
git push origin master