xaizek / zograscope (License: AGPLv3 only) (since 2018-12-07)
Mainly a syntax-aware diff that also provides a number of additional tools.
<root> / src / align.cpp (c1c8e5c2102a5ff0735bc3af1f68a5195e84e2fc) (7,886B) (mode 100644) [raw]
// Copyright (C) 2018 xaizek <xaizek@posteo.net>
//
// This file is part of zograscope.
//
// zograscope is free software: you can redistribute it and/or modify
// it under the terms of version 3 of the GNU Affero General Public License as
// published by the Free Software Foundation.
//
// zograscope is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with zograscope.  If not, see <http://www.gnu.org/licenses/>.

#include "align.hpp"

#include <algorithm>
#include <deque>
#include <string>
#include <vector>

#include <boost/utility/string_ref.hpp>
#include "dtl/dtl.hpp"

#include "utils/CountIterator.hpp"
#include "utils/strings.hpp"
#include "tree.hpp"

DiffSource::DiffSource(const Node &root)
{
    struct {
        std::vector<LineInfo> &lines;
        std::vector<bool> &modified;
        std::deque<std::string> &storage;

        std::string buffer;
        std::vector<boost::string_ref> spell;
        int line;
        int col;

        void run(const Node &node, bool forceChanged,
                 const Node *n, const Node *relative)
        {
            if (node.next != nullptr) {
                forceChanged |= (node.moved || node.state != State::Unchanged);
                n = (node.next->last ? &node : nullptr);
                relative = (node.relative != nullptr ? node.relative : nullptr);
                return run(*node.next, forceChanged, n, relative);
            }

            if (node.leaf) {
                if (node.line > line) {
                    if (!buffer.empty()) {
                        storage.push_back(buffer);
                        lines.back().text = DiceString(storage.back());
                        buffer.clear();
                    }
                    lines.insert(lines.cend(), node.line - line, LineInfo());
                    modified.insert(modified.cend(), node.line - line, false);
                    line = node.line;
                    col = 1;
                }

                if (node.col > col) {
                    buffer.append(node.col - col, ' ');
                    col = node.col;
                }

                const Node *leafN = (node.relative == nullptr ? n : &node);
                const Node *leafRelative = (node.relative != nullptr)
                                         ? node.relative
                                         : relative;

                spell.clear();
                split(node.spelling, '\n', spell);
                col += spell.front().size();
                buffer.append(spell.front().cbegin(), spell.front().cend());
                lines.back().nodes.push_back(leafN);
                lines.back().rels.push_back(leafRelative);

                const bool changed = forceChanged
                                  || node.moved
                                  || node.state != State::Unchanged;
                modified.back() = (modified.back() || changed);

                for (std::size_t i = 1U; i < spell.size(); ++i) {
                    ++line;
                    col = 1 + spell[i].size();
                    if (!buffer.empty()) {
                        storage.push_back(buffer);
                        lines.back().text = DiceString(storage.back());
                        buffer.clear();
                    }
                    lines.emplace_back(spell[i], leafN, leafRelative);
                    modified.push_back(changed);
                }
            }

            for (Node *child : node.children) {
                run(*child, forceChanged, n, relative);
            }
        }
    } visitor { lines, modified, storage, {}, {}, 0, 1 };

    visitor.run(root, false, nullptr, nullptr);
    if (!visitor.buffer.empty()) {
        storage.push_back(std::move(visitor.buffer));
        lines.back().text = DiceString(storage.back());
    }
}

std::vector<DiffLine>
makeDiff(DiffSource &&l, DiffSource &&r)
{
    using size_type = std::vector<std::string>::size_type;

    std::vector<LineInfo> &lt = l.lines;
    std::vector<LineInfo> &rt = r.lines;

    for (LineInfo &info : lt) {
        std::sort(info.rels.begin(), info.rels.end());
    }
    for (LineInfo &info : rt) {
        std::sort(info.nodes.begin(), info.nodes.end());
    }

    auto cmp = [](LineInfo &a, LineInfo &b) {
        int all = a.rels.size() + b.nodes.size();
        if (all == 0) {
            // Match empty lines.
            return true;
        }

        auto skipNulls = [](std::vector<const Node *> &v) {
            return std::find_if(v.cbegin(), v.cend(), [](const Node *n) {
                return (n != nullptr);
            });
        };

        auto aRels = skipNulls(a.rels);
        auto bRels = skipNulls(b.rels);
        auto bNodes = skipNulls(b.nodes);

        int matched = std::set_intersection(aRels, a.rels.cend(),
                                            bNodes, b.nodes.cend(),
                                            CountIterator()).getCount();
        int total = (a.rels.cend() - aRels) + (b.nodes.cend() - bNodes);
        // XXX: hard-coded thresholds.
        return false
            // Check for matched tokens first.
            || (total != 0 && 2.0f*matched/total >= 0.6f)
            // Check for complete replacement of tokens which look alike a bit.
            || (matched == 0 &&
                aRels == a.rels.cend() && bRels == b.rels.cend() &&
                !a.nodes.empty() && !b.nodes.empty() &&
                a.text.compare(b.text) >= 0.4f)
            // Resort to text based comparison for small total number of tokens
            // unless one of lines contains only removed/added tokens (first
            // condition).
            || ((aRels == a.rels.cend()) == (bRels == b.rels.cend()) &&
                all > 2 && all < 7 && a.text.compare(b.text) >= 0.8f);
    };

    dtl::Diff<LineInfo, std::vector<LineInfo>, decltype(cmp)> diff(lt, rt, cmp);
    diff.compose();

    size_type identicalLines = 0U;
    const size_type minFold = 3;
    const size_type ctxSize = 2;
    std::vector<DiffLine> diffSeq;

    auto foldIdentical = [&](bool last) {
        const size_type startContext =
            (identicalLines == diffSeq.size() ? 0U : ctxSize);
        const size_type endContext = (last ? 0U : ctxSize);
        const size_type context = startContext + endContext;

        if (identicalLines >= context && identicalLines - context > minFold) {
            diffSeq.erase(diffSeq.cend() - (identicalLines - startContext),
                          diffSeq.cend() - endContext);
            diffSeq.emplace(diffSeq.cend() - endContext, Diff::Fold,
                            identicalLines - context);
        }
        identicalLines = 0U;
    };

    auto handleSameLines = [&](size_type i, size_type j) {
        if (!l.modified[i] && !r.modified[j] &&
            lt[i].text.str() == rt[j].text.str()) {
            ++identicalLines;
            diffSeq.emplace_back(Diff::Identical);
        } else {
            foldIdentical(false);
            diffSeq.emplace_back(Diff::Different);
        }
    };

    for (const auto &x : diff.getSes().getSequence()) {
        switch (x.second.type) {
            case dtl::SES_DELETE:
                foldIdentical(false);
                diffSeq.emplace_back(Diff::Left);
                break;
            case dtl::SES_ADD:
                foldIdentical(false);
                diffSeq.emplace_back(Diff::Right);
                break;
            case dtl::SES_COMMON:
                handleSameLines(x.second.beforeIdx - 1, x.second.afterIdx - 1);
                break;
        }
    }

    foldIdentical(true);

    return diffSeq;
}
Hints

Before first commit, do not forget to setup your git environment:
git config --global user.name "your_name_here"
git config --global user.email "your@email_here"

Clone this repository using HTTP(S):
git clone https://code.reversed.top/user/xaizek/zograscope

Clone this repository using ssh (do not forget to upload a key first):
git clone ssh://rocketgit@code.reversed.top/user/xaizek/zograscope

You are allowed to anonymously push to this repository.
This means that your pushed commits will automatically be transformed into a pull request:
... clone the repository ...
... make some changes and some commits ...
git push origin master