xaizek / zograscope (License: AGPLv3 only) (since 2018-12-07)
Mainly a syntax-aware diff that also provides a number of additional tools.
Commit a7c99f472ff053514fd76992dfd0ff35f04ec652

Use label-based overlapping for satellite matching
This further improves matching and avoids unnecessary moves.

There can be regressions, but I've seen significant improvements and only a
few, much less significant, regressions.
Author: xaizek
Author date (UTC): 2022-07-25 14:46
Committer name: xaizek
Committer date (UTC): 2022-07-25 14:46
Parent(s): 5f1bc1fe40b57677ecbf75e22bf5671d6467763d
Signing key: 99DC5E4DB05F6BE2
Tree: 1735ecbb6fa3d47e25c898d2f3dc89657ee23d82
File Lines added Lines deleted
src/change-distilling.cpp 40 14
src/change-distilling.hpp 8 0
tests/Printer.cpp 2 2
tests/c/c-diffing.cpp 157 0
tests/srcml/cxx/srcml-cxx-diffing.cpp 61 0
tests/ts/lua/ts-lua-diffing.cpp 21 0
File src/change-distilling.cpp changed (mode: 100644) (index 80ff3b2..826fe95)
... ... private:
115 115
116 116 } }
117 117
118 // Method used to determine if two nodes match on overlap.
119 enum class OverlapKind
120 {
121 Relation, // The nodes were matched with each other.
122 Token, // The spelling of nodes matches.
123 };
124
118 125 // Description of a single match candidate for matching terminals. // Description of a single match candidate for matching terminals.
119 126 struct Distiller::TerminalMatch struct Distiller::TerminalMatch
120 127 { {
 
... ... struct Distiller::TerminalMatch
123 130 float similarity; // How similar labels of two nodes are in [0.0, 1.0]. float similarity; // How similar labels of two nodes are in [0.0, 1.0].
124 131 }; };
125 132
126 // Computes rate that depends on number and position of neighbouring nodes of
127 // `x` that match corresponding (by offset) nodes of `y`. This heuristics glues
128 // unmatched nodes to their already matched neighbours and resolves ties quite
129 // well. Matched nodes that are closer to the one being analyzed contribute
130 // more to the rate.
131 static int
132 rateOverlap(const Node *x, const std::vector<Node *> &po1,
133 const Node *y, const std::vector<Node *> &po2)
133 static bool
134 isAnOverlap(const Node *x, const Node *y, OverlapKind how)
135 {
136 switch (how) {
137 case OverlapKind::Relation: return (x->relative == y);
138 case OverlapKind::Token: return (x->label == y->label);
139 }
140 return false;
141 }
142
143 int
144 Distiller::rateOverlap(const Node *x, const Node *y, OverlapKind how) const
134 145 { {
135 146 int overlap = 0; int overlap = 0;
136 147
 
... ... rateOverlap(const Node *x, const std::vector<Node *> &po1,
138 149 for (int i = 1; i <= maxLeftOffset; ++i) { for (int i = 1; i <= maxLeftOffset; ++i) {
139 150 int xi = x->poID - i; int xi = x->poID - i;
140 151 int yi = y->poID - i; int yi = y->poID - i;
141 overlap += (po1[xi]->relative == po2[yi] ? maxLeftOffset - i + 1 : 0);
152 if (isAnOverlap(po1[xi], po2[yi], how)) {
153 overlap += maxLeftOffset - i + 1;
154 }
142 155 } }
143 156
144 157 int maxRightOffset = std::min({ static_cast<int>(po1.size()) - 1 - x->poID, int maxRightOffset = std::min({ static_cast<int>(po1.size()) - 1 - x->poID,
 
... ... rateOverlap(const Node *x, const std::vector<Node *> &po1,
147 160 for (int i = 1; i <= maxRightOffset; ++i) { for (int i = 1; i <= maxRightOffset; ++i) {
148 161 int xi = x->poID + i; int xi = x->poID + i;
149 162 int yi = y->poID + i; int yi = y->poID + i;
150 overlap += (po1[xi]->relative == po2[yi] ? maxRightOffset - i + 1 : 0);
163 if (isAnOverlap(po1[xi], po2[yi], how)) {
164 overlap += maxRightOffset - i + 1 + (xi == yi);
165 }
151 166 } }
152 167
153 168 return overlap; return overlap;
 
... ... Distiller::rateTerminalsMatch(const Node *x, const Node *y) const
160 175 const Node *yParent = getParent(y); const Node *yParent = getParent(y);
161 176
162 177 if (xParent && xParent->relative && xParent->relative == yParent) { if (xParent && xParent->relative && xParent->relative == yParent) {
163 return 4 + rateOverlap(x, po1, y, po2);
178 return 4 + rateOverlap(x, y, OverlapKind::Relation);
164 179 } }
165 180
166 181 if (haveValues(xParent, yParent)) { if (haveValues(xParent, yParent)) {
 
... ... Distiller::distill(Node &T1, Node &T2)
187 202
188 203 std::vector<TerminalMatch> matches; std::vector<TerminalMatch> matches;
189 204
190 // First round.
191
192 205 // First time terminal matching. // First time terminal matching.
193 206 matches = generateTerminalMatches(); matches = generateTerminalMatches();
194 207 std::stable_sort(matches.begin(), matches.end(), std::stable_sort(matches.begin(), matches.end(),
195 208 [&](const TerminalMatch &a, const TerminalMatch &b) { [&](const TerminalMatch &a, const TerminalMatch &b) {
209 // Use overlap rate here too to resolve ties, but
210 // match token-by-token rather than relations, which
211 // don't exist yet.
212 //
213 // The idea is to get fewer incorrect satellite matches
214 // which otherwise stick and kinda ruin everything.
215
216 // TODO: cache the rate (and not just here)
217 if (b.similarity == a.similarity) {
218 return rateOverlap(b.x, b.y, OverlapKind::Token)
219 < rateOverlap(a.x, a.y, OverlapKind::Token);
220 }
196 221 return b.similarity < a.similarity; return b.similarity < a.similarity;
197 222 }); });
198 223 applyTerminalMatches(matches); applyTerminalMatches(matches);
 
... ... postOrderAndInitImpl(Node &node, std::vector<Node *> &v)
289 314 static void static void
290 315 clear(Node *node) clear(Node *node)
291 316 { {
292 if (node->satellite) {
317 // Treat layer breaks as satellites.
318 if (node->satellite || node->next != nullptr) {
293 319 return; return;
294 320 } }
295 321
File src/change-distilling.hpp changed (mode: 100644) (index a5c00a7..8641662)
... ... class DiceString;
27 27 class Language; class Language;
28 28 class Node; class Node;
29 29
30 enum class OverlapKind;
31
30 32 // Implements change-distilling algorithm. // Implements change-distilling algorithm.
31 33 class Distiller class Distiller
32 34 { {
 
... ... private:
57 59 // Computes rating of a match of terminals, which is to be compared with // Computes rating of a match of terminals, which is to be compared with
58 60 // ratings of other matches. // ratings of other matches.
59 61 int rateTerminalsMatch(const Node *x, const Node *y) const; int rateTerminalsMatch(const Node *x, const Node *y) const;
62 // Computes rate that depends on number and position of neighbouring nodes
63 // of `x` that match corresponding (by offset) nodes of `y`. This
64 // heuristic glues unmatched nodes to their already matched neighbours and
65 // resolves ties quite well. Matched nodes that are closer to the one
66 // being analyzed contribute more to the rate.
67 int rateOverlap(const Node *x, const Node *y, OverlapKind how) const;
60 68 // Retrieves parent of the node possibly skipping container parents. Might // Retrieves parent of the node possibly skipping container parents. Might
61 69 // return `nullptr`. // return `nullptr`.
62 70 const Node * getParent(const Node *n) const; const Node * getParent(const Node *n) const;
File tests/Printer.cpp changed (mode: 100644) (index dd8a2ea..4ad7b5f)
... ... TEST_CASE("Comment contents is marked as moved on move", "[printer]")
129 129 R"(void f() { R"(void f() {
130 130 /* This is bad. */ /* This is bad. */
131 131 } }
132 )"), parseC(
132 )", true), parseC(
133 133 R"(void f() { R"(void f() {
134 134 { {
135 135 /* Failure is bad. */ /* Failure is bad. */
136 136 } }
137 137 } }
138 )"), true);
138 )", true), false);
139 139
140 140 std::string expected = normalizeText(R"( std::string expected = normalizeText(R"(
141 141 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
File tests/c/c-diffing.cpp changed (mode: 100644) (index 28fd0eb..7f5a23f)
... ... TEST_CASE("Number change", "[comparison]")
1889 1889 )", false); )", false);
1890 1890 } }
1891 1891
1892 TEST_CASE("C condition removal", "[comparison]")
1893 {
1894 diffC(R"(
1895 void f() {
1896 if (
1897 cmd_info->argc == 1 && /// Deletions
1898 strcasecmp(cmd_info->argv[0], "clear") == 0 /// Moves
1899 ) {
1900 cs_reset(curr_stats.cs); /// Deletions
1901 return 0; /// Deletions
1902 }
1903 }
1904 )", R"(
1905 void f() {
1906 if (
1907 strcasecmp(cmd_info->argv[0], "clear") == 0 /// Moves
1908 ) {
1909 return highlight_clear(cmd_info); /// Additions
1910 }
1911 }
1912 )", false);
1913 }
1914
1892 1915 TEST_CASE("Condition addition", "[comparison]") TEST_CASE("Condition addition", "[comparison]")
1893 1916 { {
1894 1917 diffC(R"( diffC(R"(
 
... ... TEST_CASE("Condition addition", "[comparison]")
1903 1926 } }
1904 1927 )", false); )", false);
1905 1928 } }
1929
1930 TEST_CASE("Change one of duplicates", "[comparison]")
1931 {
1932 diffC(R"(
1933 void f() {
1934 fsdata_get(fsd, SANDBOX_PATH, &ch, sizeof(ch));
1935 fsdata_get(fsd,
1936 SANDBOX_PATH /// Updates
1937 , &ch, sizeof(ch));
1938 fsdata_get(fsd, SANDBOX_PATH, &ch, sizeof(ch));
1939 fsdata_get(fsd, SANDBOX_PATH, &ch, sizeof(ch));
1940 fsdata_get(fsd, SANDBOX_PATH, &ch, sizeof(ch));
1941 fsdata_get(fsd, SANDBOX_PATH, &ch, sizeof(ch));
1942 }
1943 )", R"(
1944 void f() {
1945 fsdata_get(fsd, SANDBOX_PATH, &ch, sizeof(ch));
1946 fsdata_get(fsd,
1947 SANDBX_PATH /// Updates
1948 , &ch, sizeof(ch));
1949 fsdata_get(fsd, SANDBOX_PATH, &ch, sizeof(ch));
1950 fsdata_get(fsd, SANDBOX_PATH, &ch, sizeof(ch));
1951 fsdata_get(fsd, SANDBOX_PATH, &ch, sizeof(ch));
1952 fsdata_get(fsd, SANDBOX_PATH, &ch, sizeof(ch));
1953 }
1954 )", false);
1955 }
1956
1957 TEST_CASE("Case addition", "[comparison]")
1958 {
1959 diffC(R"(
1960 void f() {
1961 switch(a) {
1962 case 'P':
1963 switch(b) {
1964 case 'l':
1965 ++x;
1966 break;
1967 }
1968 break;
1969 }
1970 }
1971 )", R"(
1972 void f() {
1973 switch(a) {
1974 case 'l': /// Additions
1975 break; /// Additions
1976 case 'P':
1977 switch(b) {
1978 case 'l':
1979 ++x;
1980 break;
1981 }
1982 break;
1983 }
1984 }
1985 )", false);
1986 }
1987
1988 TEST_CASE("Return update", "[comparison]")
1989 {
1990 diffC(R"(
1991 void f() {
1992 if(command == NULL)
1993 {
1994 return NULL; /// Deletions
1995 }
1996
1997 if(job == NULL)
1998 {
1999 return 1;
2000 }
2001 }
2002 )", R"(
2003 void f() {
2004 if(command == NULL)
2005 {
2006 return 1; /// Additions
2007 }
2008
2009 if(job == NULL)
2010 {
2011 return 1;
2012 }
2013 }
2014 )", false);
2015 }
2016
2017 TEST_CASE("if-statement removal", "[comparison]")
2018 {
2019 diffC(R"(
2020 void f() {
2021 if (magic == NULL) { /// Deletions
2022 return -1; /// Deletions
2023 } /// Deletions
2024
2025 descr = magic_file(magic, filename);
2026 if (descr == NULL) {
2027 return -1;
2028 }
2029 }
2030 )", R"(
2031 void f() {
2032 descr = magic_file(magic, filename);
2033 if (descr == NULL) {
2034 return -1;
2035 }
2036 }
2037 )", false);
2038 }
2039
2040 TEST_CASE("Condition update", "[comparison]")
2041 {
2042 diffC(R"(
2043 void f() {
2044 if (
2045 nitems_res. /// Moves
2046 value == DCACHE_UNKNOWN /// Deletions
2047 && !is_slow_fs /// Moves
2048 ) {
2049 }
2050 }
2051 )", R"(
2052 void f() {
2053 if (
2054 ! /// Additions
2055 nitems_res. /// Moves
2056 is_valid /// Additions
2057 && !is_slow_fs /// Moves
2058 ) {
2059 }
2060 }
2061 )", false);
2062 }
File tests/srcml/cxx/srcml-cxx-diffing.cpp changed (mode: 100644) (index 7645b72..ae66c74)
... ... TEST_CASE("Condition removal", "[.srcml][srcml-cxx][comparison]")
128 128 } }
129 129 )123"); )123");
130 130 } }
131
132 TEST_CASE("C++ condition addition", "[.srcml][srcml-cxx][comparison]")
133 {
134 diffSrcmlCxx(R"(
135 void f() {
136 if (fullVal.back() == ';') {
137 }
138 }
139 )", R"(
140 void f() {
141 if (
142 fullVal.back() == ':' || /// Additions
143 fullVal.back() == ';') {
144 }
145 }
146 )");
147 }
148
149 TEST_CASE("C++ subexpr addition", "[.srcml][srcml-cxx][comparison]")
150 {
151 diffSrcmlCxx(R"(
152 void f() {
153 return -child->stype != SrcmlCxxSType::Function;
154 }
155 )", R"(
156 void f() {
157 return
158 -child->stype != SrcmlCxxSType::Parameter && /// Additions
159 -child->stype != SrcmlCxxSType::Function;
160 }
161 )");
162 }
163
164 TEST_CASE("C++ statement deletion", "[.srcml][srcml-cxx][comparison]")
165 {
166 diffSrcmlCxx(R"(
167 void f() {
168 for (int i : is) {
169 if (ci[i]->satellite) { /// Deletions
170 continue; /// Deletions
171 } /// Deletions
172
173 for (int j : js) {
174 if (cj[j]->satellite) {
175 continue;
176 }
177 }
178 }
179 }
180 )", R"(
181 void f() {
182 for (int i : is) {
183 for (int j : js) {
184 if (cj[j]->satellite) {
185 continue;
186 }
187 }
188 }
189 }
190 )");
191 }
File tests/ts/lua/ts-lua-diffing.cpp changed (mode: 100644) (index 1fd1627..927e6d0)
... ... TEST_CASE("Lua functions are matched", "[ts-lua][comparison]")
40 40 end end
41 41 )"); )");
42 42 } }
43
44 TEST_CASE("Variable is moved to table value", "[ts-lua][comparison]")
45 {
46 diffTsLua(R"(
47 function f()
48 something()
49 return
50 text --- Moves
51 end
52 )", R"(
53 function f()
54 something()
55 return
56 { --- Additions
57 text --- Additions
58 = --- Additions
59 text --- Moves
60 } --- Additions
61 end
62 )");
63 }
Hints

Before your first commit, do not forget to set up your git environment:
git config --global user.name "your_name_here"
git config --global user.email "your@email_here"

Clone this repository using HTTP(S):
git clone https://code.reversed.top/user/xaizek/zograscope

Clone this repository using ssh (do not forget to upload a key first):
git clone ssh://rocketgit@code.reversed.top/user/xaizek/zograscope

You are allowed to anonymously push to this repository.
This means that your pushed commits will automatically be transformed into a pull request:
... clone the repository ...
... make some changes and some commits ...
git push origin master