From 27c2a8ba951a4cf219b0445df780794d0dbd1d88 Mon Sep 17 00:00:00 2001
From: Tim Quatmann
Date: Tue, 19 Mar 2019 16:04:46 +0100
Subject: [PATCH] Added string utility functions to find similar strings.

---
 src/storm/utility/string.cpp | 85 ++++++++++++++++++++++++++++++++++++
 src/storm/utility/string.h   | 57 ++++++++++++++++++++++++
 2 files changed, 142 insertions(+)
 create mode 100644 src/storm/utility/string.cpp
 create mode 100644 src/storm/utility/string.h

diff --git a/src/storm/utility/string.cpp b/src/storm/utility/string.cpp
new file mode 100644
index 000000000..536bacf24
--- /dev/null
+++ b/src/storm/utility/string.cpp
@@ -0,0 +1,85 @@
+#include "storm/utility/string.h"
+#include <boost/algorithm/string/join.hpp>
+#include <algorithm>
+#include <cctype>
+#include <utility>
+
+namespace storm {
+    namespace utility {
+        namespace string {
+
+            SimilarStrings::SimilarStrings(std::string reference, double similarityFactor, bool caseSensitive) : reference(std::move(reference)), similarityFactor(similarityFactor), caseSensitive(caseSensitive), cmp([](std::pair<uint64_t, std::string> const& lhs, std::pair<uint64_t, std::string> const& rhs) { return lhs.first > rhs.first; }), distances(cmp) {
+                // Intentionally left empty. The comparator keeps the entry with the smallest distance on top of the queue.
+            }
+
+            bool SimilarStrings::add(std::string const& string) {
+                // Compute the distance once and reuse it for both the threshold check and the queue entry.
+                uint64_t const distance = levenshteinDistance(reference, string, caseSensitive);
+                // The distance never exceeds the length of the longer string; accept up to a (1 - similarityFactor) fraction of it.
+                double const threshold = static_cast<double>(std::max(reference.size(), string.size())) * (1.0 - similarityFactor);
+                if (static_cast<double>(distance) <= threshold) {
+                    distances.emplace(distance, string);
+                    return true;
+                }
+                return false;
+            }
+
+            std::vector<std::string> SimilarStrings::toList() const {
+                // Work on a copy: a priority queue can only be traversed by popping its elements.
+                auto distancesCopy = distances;
+                std::vector<std::string> result;
+                result.reserve(distancesCopy.size());
+                while (!distancesCopy.empty()) {
+                    result.push_back(distancesCopy.top().second);
+                    distancesCopy.pop();
+                }
+                return result;
+            }
+
+            std::string SimilarStrings::toDidYouMeanString() const {
+                uint64_t size = distances.size();
+                if (size == 0) {
+                    return "";
+                }
+                std::string result = boost::algorithm::join(toList(), ", ");
+                if (size == 1) {
+                    return "Did you mean " + result + "?";
+                } else {
+                    return "Did you mean any of [" + result + "]?";
+                }
+            }
+
+
+            uint64_t levenshteinDistance(std::string const& lhs, std::string const& rhs, bool caseSensitive) {
+                // d[row][col] is the distance between the first 'row' characters of lhs and the first 'col' characters of rhs.
+                std::vector<std::vector<uint64_t>> d(lhs.size() + 1, std::vector<uint64_t>(rhs.size() + 1, 0ull));
+                // Turning a prefix into the empty string (or vice versa) takes one edit per character.
+                for (uint64_t row = 1; row < d.size(); ++row) {
+                    d[row].front() = row;
+                }
+                for (uint64_t col = 1; col < d.front().size(); ++col) {
+                    d.front()[col] = col;
+                }
+
+                for (uint64_t row = 1; row < d.size(); ++row) {
+                    for (uint64_t col = 1; col < d[row].size(); ++col) {
+                        uint64_t cost = 1;
+                        if (caseSensitive) {
+                            if (lhs[row-1] == rhs[col-1]) {
+                                cost = 0;
+                            }
+                        } else {
+                            // Convert to unsigned char first: passing a negative char value to std::tolower is undefined behavior.
+                            if (std::tolower(static_cast<unsigned char>(lhs[row-1])) == std::tolower(static_cast<unsigned char>(rhs[col-1]))) {
+                                cost = 0;
+                            }
+                        }
+                        // Cheapest of deletion, insertion and substitution (substitution is free if the characters match).
+                        d[row][col] = std::min( { d[row-1][col] + 1, d[row][col - 1] + 1, d[row-1][col-1] + cost } );
+                    }
+                }
+                return d.back().back();
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/storm/utility/string.h b/src/storm/utility/string.h
new file mode 100644
index 000000000..6b44992c5
--- /dev/null
+++ b/src/storm/utility/string.h
@@ -0,0 +1,57 @@
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <queue>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace storm {
+    namespace utility {
+        namespace string {
+
+            class SimilarStrings {
+            public:
+                /*!
+                 * Gathers strings that are similar to the given reference string
+                 * @param reference the string to which all added strings are compared
+                 * @param similarityFactor controls how similar the strings need to be (0 means any string is similar, 1 means only the reference string is similar)
+                 * @param caseSensitive if false, lower/upper case is ignored
+                 */
+                SimilarStrings(std::string reference, double similarityFactor = 0.6, bool caseSensitive = true);
+
+                /*!
+                 * Adds the given string to the set of similar strings (if it is similar)
+                 * @return true, if the given string is considered similar.
+                 */
+                bool add(std::string const& string);
+
+                /*!
+                 * Gets a list of all added strings that are similar to the reference string, ordered by increasing distance to it.
+                 * The strings gathered so far are left untouched.
+                 */
+                std::vector<std::string> toList() const;
+
+                /*!
+                 * Returns a "Did you mean abc?" string
+                 * @return the suggestion text, or an empty string if no similar string has been added.
+                 */
+                std::string toDidYouMeanString() const;
+
+            private:
+                std::string reference;
+                double similarityFactor;
+                bool caseSensitive;
+                std::function<bool (std::pair<uint64_t, std::string> const&, std::pair<uint64_t, std::string> const&)> cmp;
+                std::priority_queue<std::pair<uint64_t, std::string>, std::vector<std::pair<uint64_t, std::string>>, decltype(cmp)> distances;
+            };
+
+            /*!
+             * Levenshtein distance to find similar strings
+             * @param caseSensitive if false, lower/upper case is ignored
+             */
+            uint64_t levenshteinDistance(std::string const& lhs, std::string const& rhs, bool caseSensitive = true);
+        }
+    }
+}
\ No newline at end of file