xaizek / zograscope (License: AGPLv3 only) (since 2018-12-07)
Mainly a syntax-aware diff that also provides a number of additional tools.
<root> / src / tree-edit-distance.cpp (49a36de0d2baf471f990d08b19bd0c6c54acf1fc) (11KiB) (mode 100644) [raw]
// Copyright (C) 2017 xaizek <xaizek@posteo.net>
//
// This file is part of zograscope.
//
// zograscope is free software: you can redistribute it and/or modify
// it under the terms of version 3 of the GNU Affero General Public License as
// published by the Free Software Foundation.
//
// zograscope is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with zograscope.  If not, see <http://www.gnu.org/licenses/>.

#include "tree-edit-distance.hpp"

#define BOOST_DISABLE_ASSERTS
#include <boost/multi_array.hpp>

#include <algorithm>
#include <iomanip>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

#include "tree.hpp"

struct Change
{
    int cost;
    int i, j;
};

enum { Wdel = 1, Wins = 1, Wren = 1, Wch = 3 };

static void
lmld(Node &node, std::vector<int> &l)
{
    if (node.satellite) {
        return;
    }

    int satelliteCount = 0;
    for (Node *child : node.children) {
        if (!child->satellite) {
            ++satelliteCount;
            if (satelliteCount == 1) {
                l[node.poID] = l[child->poID];
            }
            lmld(*child, l);
        }
    }

    if (satelliteCount == 0) {
        l[node.poID] = node.poID;
    }
}

static std::vector<int>
lmld(Node &root)
{
    // post-order ID of root is equal to number of nodes - 1
    std::vector<int> l(root.poID + 1);
    lmld(root, l);
    return l;
}

static int
countLeaves(Node &node)
{
    auto notSatellite = [](const Node *n) { return !n->satellite; };

    int n = std::count_if(node.children.cbegin(), node.children.cend(),
                          notSatellite) == 0;
    for (Node *child : node.children) {
        if (!child->satellite) {
            n += countLeaves(*child);
        }
    }
    return n;
}

static std::vector<int>
makeKr(Node &root, std::vector<int> &l)
{
    int k = countLeaves(root);

    std::vector<bool> visited(l.size());

    std::vector<int> kr;
    kr.reserve(k);

    int i = l.size() - 1U;
    while (k >= 1) {
        if (!visited[l[i]]) {
            kr.push_back(i);
            --k;
            visited[l[i]] = true;
        }
        --i;
    }

    std::sort(kr.begin(), kr.end());
    return kr;
}

void
printTree(const std::string &name, Tree &tree)
{
    std::cout << name << '\n';
    std::cout << std::setw(name.length())
              << std::setfill('=') << "" << std::setfill(' ') << '\n';
    tree.dump();

    std::cout << '\n';

    std::vector<Node *> po = postOrder(*tree.getRoot());
    for (Node *node : po) {
        std::cout << "po(" << node->label << ") = " << node->poID + 1 << '\n';
    }

    std::cout << '\n';

    std::vector<int> l = lmld(*tree.getRoot());
    for (unsigned int i = 0U; i < l.size(); ++i) {
        std::cout << "l(" << po[i]->label << ") = " << l[i] + 1 << '\n';
    }

    std::cout << '\n';

    std::vector<int> kr = makeKr(*tree.getRoot(), l);
    for (unsigned int i = 0U; i < kr.size(); ++i) {
        std::cout << "k[" << i << "] = " << kr[i] + 1 << '\n';
    }
}

static int
renameCost(const Node *n1, const Node *n2)
{
    if (n1->label == n2->label && n1->children.size() == n2->children.size()) {
        return 0;
    }

    const Type type1 = canonizeType(n1->type);
    const Type type2 = canonizeType(n2->type);

    if (type1 >= Type::NonInterchangeable ||
        type2 >= Type::NonInterchangeable ||
        type1 != type2) {
        return Wch;
    }

    if (type1 == Type::Virtual) {
        return (n1->stype == n2->stype) ? Wren : Wch;
    }

    return Wren;

    // std::string n1l;
    // int a = 0, b = 0;
    // for (const Node *child : n1->children) {
    //     if (child->satellite) {
    //         n1l += child->label;
    //         ++a;
    //     }
    // }
    // std::string n2l;
    // for (const Node *child : n2->children) {
    //     if (child->satellite) {
    //         n2l += child->label;
    //         ++b;
    //     }
    // }
    // const bool identicalRename = (n1->label == n2->label && n1l == n2l && n1->children.size() == n2->children.size());
    // return (identicalRename ? 0 : Wren);
}

static void
forestDist(int i, int j, const std::vector<int> &l1, const std::vector<int> &l2,
           boost::multi_array<Change, 2> &td,
           boost::multi_array<int, 2> &fd,
           const std::vector<Node *> &po1, const std::vector<Node *> &po2)
{
    fd[l1[i] - 1][l2[j] - 1] = 0;
    for (int di = l1[i]; di <= i; ++di) {
        fd[di][l2[j] - 1] = fd[di - 1][l2[j] - 1] + Wdel;
    }
    for (int dj = l2[j]; dj <= j; ++dj) {
        fd[l1[i] - 1][dj] = fd[l1[i] - 1][dj - 1] + Wins;
    }
    for (int di = l1[i]; di <= i; ++di) {
        const int ldi = l1[di];
        for (int dj = l2[j]; dj <= j; ++dj) {
            const int ldj = l2[dj];
            if (ldi == l1[i] && ldj == l2[j]) {
                fd[di][dj] = std::min({
                    fd[di - 1][dj] + Wdel,
                    fd[di][dj - 1] + Wins,
                    fd[di - 1][dj - 1] + renameCost(po1[di], po2[dj])
                });
                td[di][dj] = Change { fd[di][dj], i, j };
            } else {
                fd[di][dj] =
                    std::min({ fd[di - 1][dj] + Wdel,
                               fd[di][dj - 1] + Wins,
                               fd[ldi - 1][ldj - 1] + td[di][dj].cost });
            }
        }
    }

    // for (int di = l1[i] - 1; di <= i; ++di) {
    //     for (int dj = l2[j] - 1; dj <= j; ++dj) {
    //         std::cout << std::setw(2) << fd[di][dj] << ' ';
    //     }
    //     std::cout << '\n';
    // }

    // std::cout << "--------\n";

    // for (unsigned int i = 0; i < l1.size(); ++i) {
    //     for (unsigned int j = 0; j < l2.size(); ++j) {
    //         std::string mark;
    //         // switch (td[i][j].state) {
    //         //     case State::Unchanged: mark = "(U)"; break;
    //         //     case State::Deleted:   mark = "(D)";  break;
    //         //     case State::Inserted:  mark = "(I)"; break;
    //         //     case State::Updated:   mark = "(R)";  break;
    //         // }

    //         std::cout << std::setw(2) << td[i][j].cost << mark << ' ';
    //     }
    //     std::cout << '\n';
    // }
}

class BacktrackingQueue
{
    using pair = std::pair<int, int>;

public:
    void enqueue(int i, int j, int di, int dj)
    {
        queue[pair{ i, j }].emplace_back(di, dj);
    }

    std::vector<pair> takeCurrent()
    {
        auto lastItem = --queue.end();
        std::vector<pair> r = lastItem->second;
        queue.erase(lastItem);
        return r;
    }

    bool hasMore() const
    {
        return !queue.empty();
    }

    pair getCurrent() const
    {
        return (--queue.cend())->first;
    }

private:
    std::map<pair, std::vector<pair>> queue;
};

static void
backtrackForests(const std::vector<int> &l1, const std::vector<int> &l2,
                 const boost::multi_array<Change, 2> &td,
                 boost::multi_array<int, 2> &fd,
                 const std::vector<Node *> &po1, const std::vector<Node *> &po2,
                 BacktrackingQueue &bq)
{
    int i, j;
    std::tie(i, j) = bq.getCurrent();

    // This creates forest table identical to the one created on forward pass,
    // but it doesn't update tree table.  Tree table is fully calculated by now,
    // so we can just use its values.
    fd[l1[i] - 1][l2[j] - 1] = 0;
    for (int di = l1[i]; di <= i; ++di) {
        fd[di][l2[j] - 1] = fd[di - 1][l2[j] - 1] + Wdel;
    }
    for (int dj = l2[j]; dj <= j; ++dj) {
        fd[l1[i] - 1][dj] = fd[l1[i] - 1][dj - 1] + Wins;
    }
    for (int di = l1[i]; di <= i; ++di) {
        for (int dj = l2[j]; dj <= j; ++dj) {
            if (l1[di] == l1[i] && l2[dj] == l2[j]) {
                fd[di][dj] = std::min({
                    fd[di - 1][dj] + Wdel,
                    fd[di][dj - 1] + Wins,
                    fd[di - 1][dj - 1] + renameCost(po1[di], po2[dj])
                });
            } else {
                fd[di][dj] =
                    std::min({ fd[di - 1][dj] + Wdel,
                               fd[di][dj - 1] + Wins,
                               fd[l1[di] - 1][l2[dj] - 1] + td[di][dj].cost });
            }
        }
    }

    for (const auto &p : bq.takeCurrent()) {
        int di = p.first, dj = p.second;
        while (di > l1[i] - 1 || dj > l2[j] - 1) {
            if (di == l1[i] - 1) {
                po2[dj--]->state = State::Inserted;
            } else if (dj == l2[j] - 1) {
                po1[di--]->state = State::Deleted;
            } else if (l1[di] == l1[i] && l2[dj] == l2[j]) {
                if (fd[di][dj] == fd[di - 1][dj] + Wdel) {
                    po1[di--]->state = State::Deleted;
                } else if (fd[di][dj] == fd[di][dj - 1] + Wins) {
                    po2[dj--]->state = State::Inserted;
                } else if (fd[di][dj] != fd[di - 1][dj - 1]) {
                    po1[di]->relative = po2[dj];
                    po2[dj]->relative = po1[di];

                    po1[di--]->state = State::Updated;
                    po2[dj--]->state = State::Updated;
                } else {
                    --di;
                    --dj;
                }
            } else {
                if (fd[di][dj] == fd[di - 1][dj] + Wdel) {
                    po1[di--]->state = State::Deleted;
                } else if (fd[di][dj] == fd[di][dj - 1] + Wins) {
                    po2[dj--]->state = State::Inserted;
                } else {
                    bq.enqueue(td[di][dj].i, td[di][dj].j, di, dj);
                    di = l1[di] - 1;
                    dj = l2[dj] - 1;
                }
            }
        }
    }
}

int
ted(Node &T1, Node &T2)
{
    std::vector<Node *> po1 = postOrder(T1);
    std::vector<Node *> po2 = postOrder(T2);

    std::vector<int> l1 = lmld(T1);
    std::vector<int> l2 = lmld(T2);

    boost::multi_array<Change, 2> td(boost::extents[l1.size()][l2.size()]);
    for (unsigned int i = 0; i < l1.size(); ++i) {
        for (unsigned int j = 0; j < l2.size(); ++j) {
            td[i][j].cost = -1;
        }
    }

    std::vector<int> k1 = makeKr(T1, l1);
    std::vector<int> k2 = makeKr(T2, l2);

    using range = boost::multi_array_types::extent_range;
    boost::multi_array<int, 2> fd(boost::extents[range(-1, po1.size())]
                                                [range(-1, po2.size())]);
    // int step = 0;
    for (int x : k1) {
        for (int y : k2) {
            // std::cout << "step #" << ++step << '\n';
            forestDist(x, y, l1, l2, td, fd, po1, po2);
        }
    }

    // Mark nodes with states by backtracking through forest arrays.  We do this
    // in reversed order by regenerating only arrays that are actually needed to
    // recover solution.  Starting with cell containing the answer and figuring
    // out how we get there like in regular dynamic programming.  The difference
    // is that we need to process multiple arrays.  The tracing splits on steps
    // where forests are processed based on information from tree array.
    BacktrackingQueue bq;
    bq.enqueue(k1.back(), k2.back(), l1.size() - 1, l2.size() - 1);
    while (bq.hasMore()) {
        backtrackForests(l1, l2, td, fd, po1, po2, bq);
    }

    return td[l1.size() - 1][l2.size() - 1].cost;
}
Hints

Before first commit, do not forget to setup your git environment:
git config --global user.name "your_name_here"
git config --global user.email "your@email_here"

Clone this repository using HTTP(S):
git clone https://code.reversed.top/user/xaizek/zograscope

Clone this repository using ssh (do not forget to upload a key first):
git clone ssh://rocketgit@code.reversed.top/user/xaizek/zograscope

You are allowed to anonymously push to this repository.
This means that your pushed commits will automatically be transformed into a pull request:
... clone the repository ...
... make some changes and some commits ...
git push origin master