xaizek / zograscope (License: AGPLv3 only) (since 2018-12-07)
Mainly a syntax-aware diff that also provides a number of additional tools.
<root> / src / srcml / SrcmlTransformer.cpp (49fe677d77d05acb7edf84fee52233c895e9cf7b) (10KiB) (mode 100644) [raw]
// Copyright (C) 2017 xaizek <xaizek@posteo.net>
//
// This file is part of zograscope.
//
// zograscope is free software: you can redistribute it and/or modify
// it under the terms of version 3 of the GNU Affero General Public License as
// published by the Free Software Foundation.
//
// zograscope is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with zograscope.  If not, see <http://www.gnu.org/licenses/>.

#include "SrcmlTransformer.hpp"

#include <cstdlib>

#include <fstream>
#include <stdexcept>
#include <string>

#include <boost/algorithm/string/predicate.hpp>
#include <boost/utility/string_ref.hpp>
#include "tinyxml2/tinyxml2.h"

#ifdef HAVE_LIBSRCML
#include <srcml.h>
#endif

#include "utils/fs.hpp"
#include "utils/strings.hpp"
#include "TreeBuilder.hpp"
#include "integration.hpp"
#include "types.hpp"

namespace ti = tinyxml2;

static void toSrcmlForm(const std::string &contents,
                        const std::string &path,
                        const std::string &language,
                        ti::XMLDocument &doc);
static bool toLibSrcmlForm(const std::string &contents,
                           const std::string &language,
                           ti::XMLDocument &doc);
static boost::string_ref processValue(boost::string_ref str);

SrcmlTransformer::SrcmlTransformer(const std::string &contents,
                                   const std::string &path,
                                   TreeBuilder &tb,
                                   const std::string &language,
                              const std::unordered_map<std::string, SType> &map,
                                const std::unordered_set<std::string> &keywords,
                                   int tabWidth)
    : contents(contents), path(path), tb(tb), language(language),
      map(map), keywords(keywords), tabWidth(tabWidth)
{ }

void
SrcmlTransformer::transform()
{
    ti::XMLDocument doc;
    toSrcmlForm(contents, path, language, doc);
    if (doc.Error()) {
        throw std::runtime_error("Failed to parse: " +
                                 std::string(doc.ErrorStr()));
    }

    left = contents;
    line = 1;
    col = 1;
    inCppDirective = 0;

    tb.setRoot(visit(doc.RootElement(), 0));
}

// Parses source file via SrcML and gets result in a form of XML DOM.
static void toSrcmlForm(const std::string &contents,
                        const std::string &path,
                        const std::string &language,
                        ti::XMLDocument &doc)
{
    if (toLibSrcmlForm(contents, language, doc)) {
        return;
    }
#ifdef HAVE_LIBSRCML
    throw std::runtime_error("Failed to parse " + path);
#endif

    TempFile tmpFile(path);

    std::ofstream ofs(tmpFile);
    if (!ofs) {
        throw std::runtime_error("Failed to open temporary file: " +
                                 tmpFile.str());
    }
    ofs << contents;
    ofs.close();

    std::vector<std::string> cmd = {
        "srcml", "--language=" + language, "--src-encoding=utf8", tmpFile
    };

    std::string xml = readCommandOutput(cmd, std::string());

    if (doc.Parse(xml.data(), xml.size()) == ti::XML_SUCCESS &&
        doc.RootElement() == nullptr) {

        // Work around srcml's issues with parsing files.  Sometimes it can't
        // read them from file (when extension is ".z" it thinks it's an
        // archive) and sometimes from stdin (bugs of previous versions), so try
        // both ways.
        cmd.pop_back();
        xml = readCommandOutput(cmd, contents);
        (void)doc.Parse(xml.data(), xml.size());
    }
}

template <typename T, typename D>
std::unique_ptr<T, D>
asUniquePtr(T *p, D &&d)
{
    return std::unique_ptr<T, D>(p, std::forward<D>(d));
}

static bool
toLibSrcmlForm(const std::string &contents,
               const std::string &language,
               ti::XMLDocument &doc)
{
#ifdef HAVE_LIBSRCML
    auto archive = asUniquePtr(srcml_archive_create(), &srcml_archive_free);
    if (archive == nullptr) {
        return false;
    }

    int status;

    status = srcml_archive_set_src_encoding(archive.get(), "utf8");
    if (status != SRCML_STATUS_OK) {
        return false;
    }

    char *buffer;
    size_t size;

    status = srcml_archive_write_open_memory(archive.get(), &buffer, &size);
    if (status != SRCML_STATUS_OK) {
        return false;
    }

    auto unit = asUniquePtr(srcml_unit_create(archive.get()), &srcml_unit_free);
    if (unit == nullptr) {
        return false;
    }

    // single input file, so non-archive unit
    status = srcml_archive_enable_solitary_unit(archive.get());
    if (status != SRCML_STATUS_OK) {
        return false;
    }

    status = srcml_unit_set_language(unit.get(), language.c_str());
    if (status != SRCML_STATUS_OK) {
        return false;
    }

    status = srcml_unit_parse_memory(unit.get(),
                                     contents.data(),
                                     contents.size());
    if (status != SRCML_STATUS_OK) {
        return false;
    }

    status = srcml_archive_write_unit(archive.get(), unit.get());
    if (status != SRCML_STATUS_OK) {
        return false;
    }

    srcml_archive_close(archive.get());
    auto bufGuard = asUniquePtr(buffer, &std::free);

    (void)doc.Parse(buffer, size);
    return true;
#else
    (void)contents, (void)language, (void)doc;
    return false;
#endif
}

PNode *
SrcmlTransformer::visit(ti::XMLNode *node, int level)
{
    if (node == nullptr) {
        return tb.addNode();
    }

    SType stype = {};
    auto it = map.find(node->Value());
    if (it != map.end()) {
        stype = it->second;
    }

    bool cppDirective = boost::starts_with(node->Value(), "cpp:");
    inCppDirective += cppDirective;

    PNode *pnode = tb.addNode({}, stype);

    for (ti::XMLNode *child = node->FirstChild();
         child != nullptr;
         child = child->NextSibling()) {
        if (ti::XMLElement *e = child->ToElement()) {
            tb.append(pnode, visit(e, level + 1));
        } else if (child->ToText() != nullptr) {
            visitLeaf(node, pnode, child);
        }
    }

    inCppDirective -= cppDirective;

    return pnode;
}

void
SrcmlTransformer::visitLeaf(ti::XMLNode *parent, PNode *pnode,
                            ti::XMLNode *leaf)
{
    boost::string_ref fullVal = processValue(leaf->Value());

    SType stype = {};
    if (leaf != parent->FirstChild() || leaf->NextSibling() != nullptr) {
        stype = map.at("separator");
    }

    std::vector<boost::string_ref> vals = { fullVal };
    if (fullVal.size() > 1U &&
        (fullVal.back() == ':' || fullVal.back() == ';')) {
        vals = {
            processValue(fullVal.substr(0U, fullVal.size() - 1U)),
            processValue(fullVal.substr(fullVal.size() - 1U))
        };
    }

    for (boost::string_ref val : vals) {
        const std::size_t skipped = left.find(val);
        updatePosition(left.substr(0U, skipped), tabWidth, line, col);
        left.remove_prefix(skipped);

        const Type type = determineType(parent->ToElement(), val);

        const auto offset = static_cast<std::uint32_t>(&left[0] - &contents[0]);
        const auto len = static_cast<std::uint32_t>(val.size());
        tb.append(pnode,
                  tb.addNode(Text{offset, len, 0, 0, static_cast<int>(type)},
                             Location{line, col, 0, 0}, stype));

        updatePosition(left.substr(0U, len), tabWidth, line, col);
        left.remove_prefix(len);
    }
}

// Trims the value.
static boost::string_ref
processValue(boost::string_ref str)
{
    auto first = str.find_first_not_of(" \t\n");
    if (first != boost::string_ref::npos) {
        str.remove_prefix(first);
    }

    auto last = str.find_last_not_of(" \t\n");
    if (last != boost::string_ref::npos) {
        str.remove_suffix(str.size() - (last + 1));
    }

    return str;
}

Type
SrcmlTransformer::determineType(ti::XMLElement *elem, boost::string_ref value)
{
    boost::string_ref elemValue = elem->Value();
    if (elemValue == "literal") {
        const std::string type = elem->Attribute("type");
        if (type == "boolean") {
            return Type::IntConstants;
        } else if (type == "char") {
            return Type::CharConstants;
        } else if (type == "null") {
            return Type::IntConstants;
        } else if (type == "number") {
            return Type::IntConstants;
        } else if (type == "string") {
            return Type::StrConstants;
        } else if (type == "complex") {
            return Type::FPConstants;
        }
    } else if (elemValue == "specifier") {
        return Type::Specifiers;
    } else if (elemValue == "comment") {
        return Type::Comments;
    } else if (inCppDirective) {
        return Type::Directives;
    } else if (value[0] == '(' || value[0] == '{' || value[0] == '[') {
        return Type::LeftBrackets;
    } else if (value[0] == ')' || value[0] == '}' || value[0] == ']') {
        return Type::RightBrackets;
    } else if (elemValue == "operator") {
        return Type::Operators;
    } else if (elemValue == "name") {
        const ti::XMLNode *parent = elem;
        std::string parentValue;
        std::string grandParentValue;
        do {
            parent = parent->Parent();
            parentValue = (parent == nullptr ? "" : parent->Value());
            grandParentValue = (parent->Parent() == nullptr)
                             ? ""
                             : parent->Parent()->Value();
        } while (parentValue == "name" &&
                 (elem != parent->FirstChild() ||
                  (grandParentValue != "function" &&
                   grandParentValue != "call")));

        if (keywords.find(value.to_string()) != keywords.cend()) {
            return Type::Keywords;
        }
        if (parentValue == "type") {
            return Type::UserTypes;
        }
        if (parentValue == "function" || parentValue == "call") {
            return Type::Functions;
        }
        return Type::Identifiers;
    } else if (keywords.find(value.to_string()) != keywords.cend()) {
        return Type::Keywords;
    }
    return Type::Other;
}
Hints

Before first commit, do not forget to setup your git environment:
git config --global user.name "your_name_here"
git config --global user.email "your@email_here"

Clone this repository using HTTP(S):
git clone https://code.reversed.top/user/xaizek/zograscope

Clone this repository using ssh (do not forget to upload a key first):
git clone ssh://rocketgit@code.reversed.top/user/xaizek/zograscope

You are allowed to anonymously push to this repository.
This means that your pushed commits will automatically be transformed into a pull request:
... clone the repository ...
... make some changes and some commits ...
git push origin master