Tim Quatmann
6 years ago
2 changed files with 126 additions and 0 deletions
@ -0,0 +1,73 @@ |
|||||
|
#include "storm/utility/string.h"

#include <algorithm>
#include <cctype>
#include <utility>
#include <vector>

#include <boost/algorithm/string/join.hpp>
|
||||
|
|
||||
|
namespace storm { |
||||
|
namespace utility { |
||||
|
namespace string { |
||||
|
|
||||
|
/*!
 * Creates an empty collection of strings that are similar to the given reference string.
 *
 * @param reference the string that added candidates are compared against
 * @param similarityFactor stored and later used by add() to derive the largest accepted distance
 * @param caseSensitive stored and later forwarded to levenshteinDistance() by add()
 */
SimilarStrings::SimilarStrings(std::string reference, double similarityFactor, bool caseSensitive)
    // 'reference' is taken by value, so it is moved into the member rather than copied a second time.
    : reference(std::move(reference)),
      similarityFactor(similarityFactor),
      caseSensitive(caseSensitive),
      // With this comparator the entry with the smallest distance sits on top of the priority queue.
      cmp([](std::pair<uint64_t, std::string> const& lhs, std::pair<uint64_t, std::string> const& rhs) { return lhs.first > rhs.first; }),
      // 'distances' is handed 'cmp' here, so 'cmp' has to be initialized before it.
      distances(cmp) {
    // Intentionally left empty.
}
||||
|
|
||||
|
bool SimilarStrings::add(std::string const& string) { |
||||
|
double distance = levenshteinDistance(reference, string, caseSensitive); |
||||
|
if (distance <= static_cast<double>(std::max(reference.size(), string.size())) * (1.0 - similarityFactor)) { |
||||
|
distances.emplace(storm::utility::string::levenshteinDistance(reference, string, caseSensitive), string); |
||||
|
return true; |
||||
|
} |
||||
|
return false; |
||||
|
} |
||||
|
|
||||
|
std::vector<std::string> SimilarStrings::toList() const { |
||||
|
auto distancesCopy = distances; |
||||
|
std::vector<std::string> result; |
||||
|
while (!distancesCopy.empty()) { |
||||
|
result.push_back(distancesCopy.top().second); |
||||
|
distancesCopy.pop(); |
||||
|
} |
||||
|
return result; |
||||
|
} |
||||
|
|
||||
|
std::string SimilarStrings::toDidYouMeanString() const { |
||||
|
uint64_t size = distances.size(); |
||||
|
std::string result = boost::algorithm::join(toList(), ", "); |
||||
|
if (size == 0) { |
||||
|
return ""; |
||||
|
} else if (size == 1) { |
||||
|
return "Did you mean " + result + "?"; |
||||
|
} else { |
||||
|
return "Did you mean any of [" + result + "]?"; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
|
||||
|
/*!
 * Computes the Levenshtein (edit) distance between two strings, i.e. the minimal number of
 * single-character insertions, deletions and substitutions that turn lhs into rhs.
 *
 * @param lhs first string
 * @param rhs second string
 * @param caseSensitive if true, characters only match when they are identical;
 *                      if false, characters that differ only in upper/lower case match as well
 * @return the edit distance between lhs and rhs
 */
uint64_t levenshteinDistance(std::string const& lhs, std::string const& rhs, bool caseSensitive) {
    auto const charactersMatch = [caseSensitive](char a, char b) {
        if (caseSensitive) {
            return a == b;
        }
        // std::tolower requires a value representable as unsigned char (or EOF); plain char may be negative.
        return std::tolower(static_cast<unsigned char>(a)) == std::tolower(static_cast<unsigned char>(b));
    };

    // Only the previous and the current row of the dynamic programming table are needed.
    // previousRow[col] is the distance between the first (row - 1) characters of lhs and the first col characters of rhs.
    std::vector<uint64_t> previousRow(rhs.size() + 1);
    std::vector<uint64_t> currentRow(rhs.size() + 1);
    for (uint64_t col = 0; col < previousRow.size(); ++col) {
        // Turning the empty prefix of lhs into a prefix of rhs takes one insertion per character.
        previousRow[col] = col;
    }

    for (uint64_t row = 1; row <= lhs.size(); ++row) {
        // Turning a prefix of lhs into the empty string takes one deletion per character.
        currentRow[0] = row;
        for (uint64_t col = 1; col <= rhs.size(); ++col) {
            uint64_t const substitutionCost = charactersMatch(lhs[row - 1], rhs[col - 1]) ? 0 : 1;
            currentRow[col] = std::min({previousRow[col] + 1, currentRow[col - 1] + 1, previousRow[col - 1] + substitutionCost});
        }
        std::swap(previousRow, currentRow);
    }
    // After the final swap the last computed row is stored in previousRow.
    return previousRow.back();
}
||||
|
} |
||||
|
} |
||||
|
} |
@ -0,0 +1,53 @@ |
|||||
|
#pragma once |
||||
|
|
||||
|
#include <string> |
||||
|
#include <functional> |
||||
|
#include <queue> |
||||
|
|
||||
|
namespace storm { |
||||
|
namespace utility { |
||||
|
namespace string { |
||||
|
|
||||
|
/*!
 * Collects strings that are similar to a fixed reference string, e.g. to build "Did you mean ...?" hints.
 */
class SimilarStrings {
   public:
    /*!
     * Gathers strings that are similar to the given reference string
     * @param reference the string that added candidates are compared against
     * @param similarityFactor controls how similar the strings need to be (0 means any string is similar, 1 means only the reference string is similar)
     * @param caseSensitive if false, lower/upper case is ignored
     */
    SimilarStrings(std::string reference, double similarityFactor = 0.6, bool caseSensitive = true);

    /*!
     * Adds the given string to the set of similar strings (if it is similar)
     * @param string the candidate string
     * @return true, if the given string is considered similar.
     */
    bool add(std::string const& string);

    /*!
     * Gets a list of all added strings that are similar to the reference string.
     * The method is const: the gathered strings are kept and can be queried again.
     * @return the gathered strings
     */
    std::vector<std::string> toList() const;

    /*!
     * Returns a "Did you mean abc?" string built from the gathered strings
     * @return the hint text
     */
    std::string toDidYouMeanString() const;

   private:
    /// A gathered string paired with its distance to the reference string.
    using DistanceAndString = std::pair<uint64_t, std::string>;

    std::string reference;
    double similarityFactor;
    bool caseSensitive;
    // Ordering used by 'distances'. It must be declared before 'distances', which is constructed from it.
    std::function<bool(DistanceAndString const&, DistanceAndString const&)> cmp;
    std::priority_queue<DistanceAndString, std::vector<DistanceAndString>, decltype(cmp)> distances;
};
||||
|
|
||||
|
/*!
 * Computes the Levenshtein (edit) distance between two strings; used to find similar strings.
 * @param lhs first string
 * @param rhs second string
 * @param caseSensitive meant to select whether upper and lower case letters count as different characters (defaults to true)
 * @return the distance between lhs and rhs
 */
||||
|
uint64_t levenshteinDistance(std::string const& lhs, std::string const& rhs, bool caseSensitive = true); |
||||
|
} |
||||
|
} |
||||
|
} |
Write
Preview
Loading…
Cancel
Save
Reference in new issue