xaizek / zograscope (License: AGPLv3 only) (since 2018-12-07)
Mainly a syntax-aware diff that also provides a number of additional tools.
<root> / src / utils / strings.cpp (bf4267a33358b0eeaf0c7c70031af6a82843d374) (3,789B) (mode 100644) [raw]
// Copyright (C) 2017 xaizek <xaizek@posteo.net>
//
// This file is part of zograscope.
//
// zograscope is free software: you can redistribute it and/or modify
// it under the terms of version 3 of the GNU Affero General Public License as
// published by the Free Software Foundation.
//
// zograscope is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with zograscope.  If not, see <http://www.gnu.org/licenses/>.

#include "utils/strings.hpp"

#include <climits>

#include <algorithm>
#include <iterator>
#include <limits>
#include <utility>

#include <boost/utility/string_ref.hpp>

#include "utils/CountIterator.hpp"

// Computes similarity of this string and the other one as twice the number of
// bigrams they share divided by the total number of bigrams of both strings.
//
// Strings shorter than two characters have no bigrams: two such strings score
// 1.0 when equal and 0.0 otherwise, a short string against a longer one always
// scores 0.0.
float
DiceString::compare(DiceString &other)
{
    const bool thisTooShort = (s.length() < 2U);
    const bool otherTooShort = (other.s.length() < 2U);

    if (thisTooShort && otherTooShort) {
        if (s == other.s) {
            return 1.0f;
        }
        return 0.0f;
    }
    if (thisTooShort || otherTooShort) {
        return 0.0f;
    }

    const std::vector<unsigned short> &lhs = getBigrams();
    const std::vector<unsigned short> &rhs = other.getBigrams();

    // The output iterator doesn't store elements, it only counts how many of
    // them were written to it.
    auto counter = std::set_intersection(lhs.cbegin(), lhs.cend(),
                                         rhs.cbegin(), rhs.cend(),
                                         CountIterator());
    const int shared = counter.getCount();

    return (2.0f*shared)/(lhs.size() + rhs.size());
}

// Computes the list of unique bigrams of the string on first use and returns
// the stored list on subsequent calls.
//
// A bigram is a pair of adjacent characters packed into a single value: the
// first character is shifted into the high bits, the second one occupies the
// low bits.  Strings shorter than two characters have no bigrams and yield an
// empty list.
//
// Returns reference to the list, which is sorted in ascending order and
// contains no duplicates.
const std::vector<unsigned short> &
DiceString::getBigrams()
{
    if (!bigrams.empty() || s.length() < 2U) {
        // Either already computed or there is nothing to compute.
        return bigrams;
    }

    auto makeBigram = [](boost::string_ref s, std::size_t at) {
        return static_cast<unsigned short>(
            (static_cast<unsigned char>(s[at]) << CHAR_BIT)
           | static_cast<unsigned char>(s[at + 1U]));
    };

    const std::size_t pairCount = s.length() - 1U;

    // Using std::sort is fine for a relatively small number of elements.
    if (s.length() < 10000) {
        bigrams.reserve(pairCount);
        for (std::size_t i = 0U; i < pairCount; ++i) {
            bigrams.push_back(makeBigram(s, i));
        }
        std::sort(bigrams.begin(), bigrams.end());
        bigrams.erase(std::unique(bigrams.begin(), bigrams.end()),
                      bigrams.end());

        return bigrams;
    }

    // But for strings of tens of thousands of characters this works faster
    // (exact threshold yet to be determined).

    // Number of distinct bigram values.  It's one more than the maximum value
    // so that the largest bigram (two 0xff characters) gets a slot of its own
    // instead of indexing past the end of the table.
    const std::size_t bigramCount =
        static_cast<std::size_t>(std::numeric_limits<unsigned short>::max())
      + 1U;
    // The table is reused between calls on the same thread, every mark set
    // below is cleared again while collecting the results.
    thread_local std::vector<bool> present(bigramCount);

    // There can't be more unique bigrams than distinct values, so don't
    // allocate more than that for huge strings.  Reserving before marking
    // also guarantees that push_back() below doesn't need to reallocate.
    bigrams.reserve(std::min<std::size_t>(pairCount, bigramCount));
    for (std::size_t i = 0U; i < pairCount; ++i) {
        present[makeBigram(s, i)] = true;
    }
    // Walking the table in index order produces already sorted output.
    for (std::size_t i = 0U; i < bigramCount; ++i) {
        if (present[i]) {
            bigrams.push_back(static_cast<unsigned short>(i));
            present[i] = false;
        }
    }

    return bigrams;
}

// Converts "\r\n" line endings into "\n" by dropping every '\r' that is
// immediately followed by '\n'.  Any other '\r' is left untouched.
//
// str - string to process, it's modified in place.
//
// Returns the same string object as an rvalue reference, so the result must
// be consumed while the argument is still alive.
std::string &&
normalizeEols(std::string &&str)
{
    // Nothing before the first '\r' can change, so start compacting there.
    std::size_t out = str.find('\r');
    if (out == std::string::npos) {
        return std::move(str);
    }

    // Single pass that copies kept characters towards the front.  Unlike
    // repeatedly searching from the same position, this always advances, so a
    // '\r' that isn't followed by '\n' can't stall the loop.
    const std::size_t size = str.size();
    for (std::size_t in = out; in < size; ++in) {
        const bool crBeforeLf = str[in] == '\r'
                             && in + 1U < size
                             && str[in + 1U] == '\n';
        if (!crBeforeLf) {
            str[out++] = str[in];
        }
    }
    str.resize(out);

    return std::move(str);
}

// Advances line and column numbers over the characters of the string.
//
// str      - text to process, one character at a time.
// tabWidth - distance between tabulation stops; values less than one are
//            treated as one, because zero would cause division by zero.
// line     - line number, incremented on every '\n'.
// col      - column number: reset to 1 on '\n', moved to the column after the
//            next tabulation stop on '\t' (columns are counted from 1) and
//            incremented for any other character.
void
updatePosition(boost::string_ref str, int tabWidth, int &line, int &col)
{
    const int tabStop = (tabWidth > 0) ? tabWidth : 1;

    for (const char c : str) {
        switch (c) {
            case '\n':
                ++line;
                col = 1;
                break;
            case '\t':
                // Distance to the next stop, accounting for columns starting
                // at 1 rather than 0.
                col += tabStop - (col - 1)%tabStop;
                break;

            default:
                ++col;
                break;
        }
    }
}
Hints

Before your first commit, do not forget to set up your git environment:
git config --global user.name "your_name_here"
git config --global user.email "your@email_here"

Clone this repository using HTTP(S):
git clone https://code.reversed.top/user/xaizek/zograscope

Clone this repository using ssh (do not forget to upload a key first):
git clone ssh://rocketgit@code.reversed.top/user/xaizek/zograscope

You are allowed to anonymously push to this repository.
This means that your pushed commits will automatically be transformed into a pull request:
... clone the repository ...
... make some changes and some commits ...
git push origin master