File src/change-distilling.cpp changed (mode: 100644) (index 80ff3b2..826fe95) |
... |
... |
private: |
115 |
115 |
|
|
116 |
116 |
} |
} |
117 |
117 |
|
|
|
118 |
|
// Method used to determine if two nodes match on overlap. |
|
119 |
|
enum class OverlapKind |
|
120 |
|
{ |
|
121 |
|
Relation, // The nodes were matched with each other. |
|
122 |
|
Token, // The spelling of nodes matches. |
|
123 |
|
}; |
|
124 |
|
|
118 |
125 |
// Description of a single match candidate for matching terminals. |
// Description of a single match candidate for matching terminals. |
119 |
126 |
struct Distiller::TerminalMatch |
struct Distiller::TerminalMatch |
120 |
127 |
{ |
{ |
|
... |
... |
struct Distiller::TerminalMatch |
123 |
130 |
float similarity; // How similar labels of two nodes are in [0.0, 1.0]. |
float similarity; // How similar labels of two nodes are in [0.0, 1.0]. |
124 |
131 |
}; |
}; |
125 |
132 |
|
|
126 |
|
// Computes rate that depends on number and position of neighbouring nodes of |
|
127 |
|
// `x` that match corresponding (by offset) nodes of `y`. This heuristic glues |
|
128 |
|
// unmatched nodes to their already matched neighbours and resolves ties quite |
|
129 |
|
// well. Matched nodes that are closer to the one being analyzed contribute |
|
130 |
|
// more to the rate. |
|
131 |
|
static int |
|
132 |
|
rateOverlap(const Node *x, const std::vector<Node *> &po1, |
|
133 |
|
const Node *y, const std::vector<Node *> &po2) |
|
|
133 |
|
static bool |
|
134 |
|
isAnOverlap(const Node *x, const Node *y, OverlapKind how) |
|
135 |
|
{ |
|
136 |
|
switch (how) { |
|
137 |
|
case OverlapKind::Relation: return (x->relative == y); |
|
138 |
|
case OverlapKind::Token: return (x->label == y->label); |
|
139 |
|
} |
|
140 |
|
return false; |
|
141 |
|
} |
|
142 |
|
|
|
143 |
|
int |
|
144 |
|
Distiller::rateOverlap(const Node *x, const Node *y, OverlapKind how) const |
134 |
145 |
{ |
{ |
135 |
146 |
int overlap = 0; |
int overlap = 0; |
136 |
147 |
|
|
|
... |
... |
rateOverlap(const Node *x, const std::vector<Node *> &po1, |
138 |
149 |
for (int i = 1; i <= maxLeftOffset; ++i) { |
for (int i = 1; i <= maxLeftOffset; ++i) { |
139 |
150 |
int xi = x->poID - i; |
int xi = x->poID - i; |
140 |
151 |
int yi = y->poID - i; |
int yi = y->poID - i; |
141 |
|
overlap += (po1[xi]->relative == po2[yi] ? maxLeftOffset - i + 1 : 0); |
|
|
152 |
|
if (isAnOverlap(po1[xi], po2[yi], how)) { |
|
153 |
|
overlap += maxLeftOffset - i + 1; |
|
154 |
|
} |
142 |
155 |
} |
} |
143 |
156 |
|
|
144 |
157 |
int maxRightOffset = std::min({ static_cast<int>(po1.size()) - 1 - x->poID, |
int maxRightOffset = std::min({ static_cast<int>(po1.size()) - 1 - x->poID, |
|
... |
... |
rateOverlap(const Node *x, const std::vector<Node *> &po1, |
147 |
160 |
for (int i = 1; i <= maxRightOffset; ++i) { |
for (int i = 1; i <= maxRightOffset; ++i) { |
148 |
161 |
int xi = x->poID + i; |
int xi = x->poID + i; |
149 |
162 |
int yi = y->poID + i; |
int yi = y->poID + i; |
150 |
|
overlap += (po1[xi]->relative == po2[yi] ? maxRightOffset - i + 1 : 0); |
|
|
163 |
|
if (isAnOverlap(po1[xi], po2[yi], how)) { |
|
164 |
|
overlap += maxRightOffset - i + 1 + (xi == yi); |
|
165 |
|
} |
151 |
166 |
} |
} |
152 |
167 |
|
|
153 |
168 |
return overlap; |
return overlap; |
|
... |
... |
Distiller::rateTerminalsMatch(const Node *x, const Node *y) const |
160 |
175 |
const Node *yParent = getParent(y); |
const Node *yParent = getParent(y); |
161 |
176 |
|
|
162 |
177 |
if (xParent && xParent->relative && xParent->relative == yParent) { |
if (xParent && xParent->relative && xParent->relative == yParent) { |
163 |
|
return 4 + rateOverlap(x, po1, y, po2); |
|
|
178 |
|
return 4 + rateOverlap(x, y, OverlapKind::Relation); |
164 |
179 |
} |
} |
165 |
180 |
|
|
166 |
181 |
if (haveValues(xParent, yParent)) { |
if (haveValues(xParent, yParent)) { |
|
... |
... |
Distiller::distill(Node &T1, Node &T2) |
187 |
202 |
|
|
188 |
203 |
std::vector<TerminalMatch> matches; |
std::vector<TerminalMatch> matches; |
189 |
204 |
|
|
190 |
|
// First round. |
|
191 |
|
|
|
192 |
205 |
// First time terminal matching. |
// First time terminal matching. |
193 |
206 |
matches = generateTerminalMatches(); |
matches = generateTerminalMatches(); |
194 |
207 |
std::stable_sort(matches.begin(), matches.end(), |
std::stable_sort(matches.begin(), matches.end(), |
195 |
208 |
[&](const TerminalMatch &a, const TerminalMatch &b) { |
[&](const TerminalMatch &a, const TerminalMatch &b) { |
|
209 |
|
// Use overlap rate here too to resolve ties, but |
|
210 |
|
// match token-by-token rather than relations, which |
|
211 |
|
// don't exist yet. |
|
212 |
|
// |
|
213 |
|
// The idea is to get fewer incorrect satellite matches |
|
214 |
|
// which otherwise stick and kinda ruin everything. |
|
215 |
|
|
|
216 |
|
// TODO: cache the rate (and not just here) |
|
217 |
|
if (b.similarity == a.similarity) { |
|
218 |
|
return rateOverlap(b.x, b.y, OverlapKind::Token) |
|
219 |
|
< rateOverlap(a.x, a.y, OverlapKind::Token); |
|
220 |
|
} |
196 |
221 |
return b.similarity < a.similarity; |
return b.similarity < a.similarity; |
197 |
222 |
}); |
}); |
198 |
223 |
applyTerminalMatches(matches); |
applyTerminalMatches(matches); |
|
... |
... |
postOrderAndInitImpl(Node &node, std::vector<Node *> &v) |
289 |
314 |
static void |
static void |
290 |
315 |
clear(Node *node) |
clear(Node *node) |
291 |
316 |
{ |
{ |
292 |
|
if (node->satellite) { |
|
|
317 |
|
// Treat layer breaks as satellites. |
|
318 |
|
if (node->satellite || node->next != nullptr) { |
293 |
319 |
return; |
return; |
294 |
320 |
} |
} |
295 |
321 |
|
|
File src/change-distilling.hpp changed (mode: 100644) (index a5c00a7..8641662) |
... |
... |
class DiceString; |
27 |
27 |
class Language; |
class Language; |
28 |
28 |
class Node; |
class Node; |
29 |
29 |
|
|
|
30 |
|
enum class OverlapKind; |
|
31 |
|
|
30 |
32 |
// Implements change-distilling algorithm. |
// Implements change-distilling algorithm. |
31 |
33 |
class Distiller |
class Distiller |
32 |
34 |
{ |
{ |
|
... |
... |
private: |
57 |
59 |
// Computes rating of a match of terminals, which is to be compared with |
// Computes rating of a match of terminals, which is to be compared with |
58 |
60 |
// ratings of other matches. |
// ratings of other matches. |
59 |
61 |
int rateTerminalsMatch(const Node *x, const Node *y) const; |
int rateTerminalsMatch(const Node *x, const Node *y) const; |
|
62 |
|
// Computes rate that depends on number and position of neighbouring nodes |
|
63 |
|
// of `x` that match corresponding (by offset) nodes of `y`. This |
|
64 |
|
// heuristic glues unmatched nodes to their already matched neighbours and |
|
65 |
|
// resolves ties quite well. Matched nodes that are closer to the one |
|
66 |
|
// being analyzed contribute more to the rate. |
|
67 |
|
int rateOverlap(const Node *x, const Node *y, OverlapKind how) const; |
60 |
68 |
// Retrieves parent of the node possibly skipping container parents. Might |
// Retrieves parent of the node possibly skipping container parents. Might |
61 |
69 |
// return `nullptr`. |
// return `nullptr`. |
62 |
70 |
const Node * getParent(const Node *n) const; |
const Node * getParent(const Node *n) const; |
File tests/c/c-diffing.cpp changed (mode: 100644) (index 28fd0eb..7f5a23f) |
... |
... |
TEST_CASE("Number change", "[comparison]") |
1889 |
1889 |
)", false); |
)", false); |
1890 |
1890 |
} |
} |
1891 |
1891 |
|
|
|
1892 |
|
TEST_CASE("C condition removal", "[comparison]") |
|
1893 |
|
{ |
|
1894 |
|
diffC(R"( |
|
1895 |
|
void f() { |
|
1896 |
|
if ( |
|
1897 |
|
cmd_info->argc == 1 && /// Deletions |
|
1898 |
|
strcasecmp(cmd_info->argv[0], "clear") == 0 /// Moves |
|
1899 |
|
) { |
|
1900 |
|
cs_reset(curr_stats.cs); /// Deletions |
|
1901 |
|
return 0; /// Deletions |
|
1902 |
|
} |
|
1903 |
|
} |
|
1904 |
|
)", R"( |
|
1905 |
|
void f() { |
|
1906 |
|
if ( |
|
1907 |
|
strcasecmp(cmd_info->argv[0], "clear") == 0 /// Moves |
|
1908 |
|
) { |
|
1909 |
|
return highlight_clear(cmd_info); /// Additions |
|
1910 |
|
} |
|
1911 |
|
} |
|
1912 |
|
)", false); |
|
1913 |
|
} |
|
1914 |
|
|
1892 |
1915 |
TEST_CASE("Condition addition", "[comparison]") |
TEST_CASE("Condition addition", "[comparison]") |
1893 |
1916 |
{ |
{ |
1894 |
1917 |
diffC(R"( |
diffC(R"( |
|
... |
... |
TEST_CASE("Condition addition", "[comparison]") |
1903 |
1926 |
} |
} |
1904 |
1927 |
)", false); |
)", false); |
1905 |
1928 |
} |
} |
|
1929 |
|
|
|
1930 |
|
TEST_CASE("Change one of duplicates", "[comparison]") |
|
1931 |
|
{ |
|
1932 |
|
diffC(R"( |
|
1933 |
|
void f() { |
|
1934 |
|
fsdata_get(fsd, SANDBOX_PATH, &ch, sizeof(ch)); |
|
1935 |
|
fsdata_get(fsd, |
|
1936 |
|
SANDBOX_PATH /// Updates |
|
1937 |
|
, &ch, sizeof(ch)); |
|
1938 |
|
fsdata_get(fsd, SANDBOX_PATH, &ch, sizeof(ch)); |
|
1939 |
|
fsdata_get(fsd, SANDBOX_PATH, &ch, sizeof(ch)); |
|
1940 |
|
fsdata_get(fsd, SANDBOX_PATH, &ch, sizeof(ch)); |
|
1941 |
|
fsdata_get(fsd, SANDBOX_PATH, &ch, sizeof(ch)); |
|
1942 |
|
} |
|
1943 |
|
)", R"( |
|
1944 |
|
void f() { |
|
1945 |
|
fsdata_get(fsd, SANDBOX_PATH, &ch, sizeof(ch)); |
|
1946 |
|
fsdata_get(fsd, |
|
1947 |
|
SANDBX_PATH /// Updates |
|
1948 |
|
, &ch, sizeof(ch)); |
|
1949 |
|
fsdata_get(fsd, SANDBOX_PATH, &ch, sizeof(ch)); |
|
1950 |
|
fsdata_get(fsd, SANDBOX_PATH, &ch, sizeof(ch)); |
|
1951 |
|
fsdata_get(fsd, SANDBOX_PATH, &ch, sizeof(ch)); |
|
1952 |
|
fsdata_get(fsd, SANDBOX_PATH, &ch, sizeof(ch)); |
|
1953 |
|
} |
|
1954 |
|
)", false); |
|
1955 |
|
} |
|
1956 |
|
|
|
1957 |
|
TEST_CASE("Case addition", "[comparison]") |
|
1958 |
|
{ |
|
1959 |
|
diffC(R"( |
|
1960 |
|
void f() { |
|
1961 |
|
switch(a) { |
|
1962 |
|
case 'P': |
|
1963 |
|
switch(b) { |
|
1964 |
|
case 'l': |
|
1965 |
|
++x; |
|
1966 |
|
break; |
|
1967 |
|
} |
|
1968 |
|
break; |
|
1969 |
|
} |
|
1970 |
|
} |
|
1971 |
|
)", R"( |
|
1972 |
|
void f() { |
|
1973 |
|
switch(a) { |
|
1974 |
|
case 'l': /// Additions |
|
1975 |
|
break; /// Additions |
|
1976 |
|
case 'P': |
|
1977 |
|
switch(b) { |
|
1978 |
|
case 'l': |
|
1979 |
|
++x; |
|
1980 |
|
break; |
|
1981 |
|
} |
|
1982 |
|
break; |
|
1983 |
|
} |
|
1984 |
|
} |
|
1985 |
|
)", false); |
|
1986 |
|
} |
|
1987 |
|
|
|
1988 |
|
TEST_CASE("Return update", "[comparison]") |
|
1989 |
|
{ |
|
1990 |
|
diffC(R"( |
|
1991 |
|
void f() { |
|
1992 |
|
if(command == NULL) |
|
1993 |
|
{ |
|
1994 |
|
return NULL; /// Deletions |
|
1995 |
|
} |
|
1996 |
|
|
|
1997 |
|
if(job == NULL) |
|
1998 |
|
{ |
|
1999 |
|
return 1; |
|
2000 |
|
} |
|
2001 |
|
} |
|
2002 |
|
)", R"( |
|
2003 |
|
void f() { |
|
2004 |
|
if(command == NULL) |
|
2005 |
|
{ |
|
2006 |
|
return 1; /// Additions |
|
2007 |
|
} |
|
2008 |
|
|
|
2009 |
|
if(job == NULL) |
|
2010 |
|
{ |
|
2011 |
|
return 1; |
|
2012 |
|
} |
|
2013 |
|
} |
|
2014 |
|
)", false); |
|
2015 |
|
} |
|
2016 |
|
|
|
2017 |
|
TEST_CASE("if-statement removal", "[comparison]") |
|
2018 |
|
{ |
|
2019 |
|
diffC(R"( |
|
2020 |
|
void f() { |
|
2021 |
|
if (magic == NULL) { /// Deletions |
|
2022 |
|
return -1; /// Deletions |
|
2023 |
|
} /// Deletions |
|
2024 |
|
|
|
2025 |
|
descr = magic_file(magic, filename); |
|
2026 |
|
if (descr == NULL) { |
|
2027 |
|
return -1; |
|
2028 |
|
} |
|
2029 |
|
} |
|
2030 |
|
)", R"( |
|
2031 |
|
void f() { |
|
2032 |
|
descr = magic_file(magic, filename); |
|
2033 |
|
if (descr == NULL) { |
|
2034 |
|
return -1; |
|
2035 |
|
} |
|
2036 |
|
} |
|
2037 |
|
)", false); |
|
2038 |
|
} |
|
2039 |
|
|
|
2040 |
|
TEST_CASE("Condition update", "[comparison]") |
|
2041 |
|
{ |
|
2042 |
|
diffC(R"( |
|
2043 |
|
void f() { |
|
2044 |
|
if ( |
|
2045 |
|
nitems_res. /// Moves |
|
2046 |
|
value == DCACHE_UNKNOWN /// Deletions |
|
2047 |
|
&& !is_slow_fs /// Moves |
|
2048 |
|
) { |
|
2049 |
|
} |
|
2050 |
|
} |
|
2051 |
|
)", R"( |
|
2052 |
|
void f() { |
|
2053 |
|
if ( |
|
2054 |
|
! /// Additions |
|
2055 |
|
nitems_res. /// Moves |
|
2056 |
|
is_valid /// Additions |
|
2057 |
|
&& !is_slow_fs /// Moves |
|
2058 |
|
) { |
|
2059 |
|
} |
|
2060 |
|
} |
|
2061 |
|
)", false); |
|
2062 |
|
} |