Tim Quatmann
6 years ago
2 changed files with 126 additions and 0 deletions
@ -0,0 +1,73 @@ |
|||
#include "storm/utility/string.h"

#include <algorithm>
#include <cctype>
#include <utility>
#include <vector>

#include <boost/algorithm/string/join.hpp>
|
|||
|
|||
namespace storm { |
|||
namespace utility { |
|||
namespace string { |
|||
|
|||
/*!
 * Creates an (initially empty) collection of strings that are similar to the given reference string.
 * @param reference the string all added strings are compared against
 * @param similarityFactor value that scales the number of tolerated edits in add(): larger values tolerate fewer edits
 * @param caseSensitive forwarded to levenshteinDistance whenever a string is added
 */
SimilarStrings::SimilarStrings(std::string reference, double similarityFactor, bool caseSensitive)
    : reference(std::move(reference)),  // the parameter is a by-value sink, so move instead of copying it again
      similarityFactor(similarityFactor),
      caseSensitive(caseSensitive),
      // "Greater-than" on the distance turns the priority queue into a min-heap: the closest string is on top.
      cmp([](std::pair<uint64_t, std::string> const& lhs, std::pair<uint64_t, std::string> const& rhs) { return lhs.first > rhs.first; }),
      // cmp is declared before distances in the class, hence it is already initialized here.
      distances(cmp) {
    // Intentionally left empty.
}
|||
|
|||
bool SimilarStrings::add(std::string const& string) { |
|||
double distance = levenshteinDistance(reference, string, caseSensitive); |
|||
if (distance <= static_cast<double>(std::max(reference.size(), string.size())) * (1.0 - similarityFactor)) { |
|||
distances.emplace(storm::utility::string::levenshteinDistance(reference, string, caseSensitive), string); |
|||
return true; |
|||
} |
|||
return false; |
|||
} |
|||
|
|||
std::vector<std::string> SimilarStrings::toList() const { |
|||
auto distancesCopy = distances; |
|||
std::vector<std::string> result; |
|||
while (!distancesCopy.empty()) { |
|||
result.push_back(distancesCopy.top().second); |
|||
distancesCopy.pop(); |
|||
} |
|||
return result; |
|||
} |
|||
|
|||
std::string SimilarStrings::toDidYouMeanString() const { |
|||
uint64_t size = distances.size(); |
|||
std::string result = boost::algorithm::join(toList(), ", "); |
|||
if (size == 0) { |
|||
return ""; |
|||
} else if (size == 1) { |
|||
return "Did you mean " + result + "?"; |
|||
} else { |
|||
return "Did you mean any of [" + result + "]?"; |
|||
} |
|||
} |
|||
|
|||
|
|||
/*!
 * Computes the Levenshtein (edit) distance between the two given strings, i.e., the minimal number of
 * single-character insertions, deletions and substitutions that transform lhs into rhs.
 * @param lhs the first string
 * @param rhs the second string
 * @param caseSensitive if true, characters have to match exactly; if false, characters that only differ
 *        in lower/upper case are considered equal
 * @return the edit distance between lhs and rhs
 */
uint64_t levenshteinDistance(std::string const& lhs, std::string const& rhs, bool caseSensitive) {
    // Decides whether two characters count as equal under the requested case handling.
    // The cast to unsigned char is required: passing a negative char to tolower is undefined behaviour.
    auto const charsMatch = [caseSensitive](char a, char b) {
        if (caseSensitive) {
            return a == b;
        }
        return std::tolower(static_cast<unsigned char>(a)) == std::tolower(static_cast<unsigned char>(b));
    };

    // d[row][col] holds the distance between the first `row` characters of lhs and the first `col` characters of rhs.
    std::vector<std::vector<uint64_t>> d(lhs.size() + 1, std::vector<uint64_t>(rhs.size() + 1, 0ull));
    // Transforming a prefix into the empty string (or vice versa) takes one edit per character.
    for (uint64_t row = 1; row < d.size(); ++row) {
        d[row].front() = row;
    }
    for (uint64_t col = 1; col < d.front().size(); ++col) {
        d.front()[col] = col;
    }

    for (uint64_t row = 1; row < d.size(); ++row) {
        for (uint64_t col = 1; col < d[row].size(); ++col) {
            uint64_t const cost = charsMatch(lhs[row - 1], rhs[col - 1]) ? 0 : 1;
            // Cheapest of: deletion, insertion, substitution (or match, if cost is zero).
            d[row][col] = std::min({d[row - 1][col] + 1, d[row][col - 1] + 1, d[row - 1][col - 1] + cost});
        }
    }
    return d.back().back();
}
|||
} |
|||
} |
|||
} |
@ -0,0 +1,53 @@ |
|||
#pragma once |
|||
|
|||
#include <string> |
|||
#include <functional> |
|||
#include <queue> |
|||
|
|||
namespace storm { |
|||
namespace utility { |
|||
namespace string { |
|||
|
|||
/*!
 * Collects strings that are similar to a fixed reference string, e.g. to produce "Did you mean ...?" hints.
 * Similarity is measured via the Levenshtein distance to the reference string.
 */
class SimilarStrings {
   public:
    /*!
     * Gathers strings that are similar to the given reference string
     * @param reference the string that added strings are compared against
     * @param similarityFactor controls how similar the strings need to be (0 means any string is similar, 1 means only the reference string is
     * similar)
     * @param caseSensitive if false, lower/upper case is ignored
     */
    SimilarStrings(std::string reference, double similarityFactor = 0.6, bool caseSensitive = true);

    /*!
     * Adds the given string to the set of similar strings (if it is similar)
     * @param string the candidate string
     * @return true, if the given string is considered similar.
     */
    bool add(std::string const& string);

    /*!
     * Gets a list of all added strings that are similar to the reference string.
     * The strings with the smallest distance to the reference string come first.
     * The gathered strings are kept, i.e., this object is not modified.
     */
    std::vector<std::string> toList() const;

    /*!
     * Returns a "Did you mean abc?" string
     * @return the empty string if no similar string has been gathered, otherwise a hint mentioning all gathered strings
     */
    std::string toDidYouMeanString() const;

   private:
    // The string all candidates are compared against.
    std::string reference;
    // Value in add()'s acceptance threshold; see the constructor documentation.
    double similarityFactor;
    // Case handling flag that is passed on to levenshteinDistance.
    bool caseSensitive;
    // Comparator for the queue below. It has to be declared before `distances`, which is constructed from it.
    std::function<bool(std::pair<uint64_t, std::string> const&, std::pair<uint64_t, std::string> const&)> cmp;
    // The gathered (distance, string) pairs, ordered by cmp.
    std::priority_queue<std::pair<uint64_t, std::string>, std::vector<std::pair<uint64_t, std::string>>, decltype(cmp)> distances;
};
|||
|
|||
/*! |
|||
* Computes the Levenshtein (edit) distance between two strings; used to find similar strings
|||
*/ |
|||
uint64_t levenshteinDistance(std::string const& lhs, std::string const& rhs, bool caseSensitive = true); |
|||
} |
|||
} |
|||
} |
Reference in new issue
xxxxxxxxxx