diff --git a/regression/cbmc/unknown-argument-suggestion/test.desc b/regression/cbmc/unknown-argument-suggestion/test.desc new file mode 100644 index 00000000000..bf1204f5842 --- /dev/null +++ b/regression/cbmc/unknown-argument-suggestion/test.desc @@ -0,0 +1,12 @@ +CORE +dummy.c +--traec +did you mean --trace +^EXIT=(1|64)$ +^SIGNAL=0$ +-- +-- +This checks that we get a useful suggestion when we make a typo on the +commandline. + +The error code is 1 on linux/osx and 64 on windows for some reason. diff --git a/regression/validate-trace-xml-schema/check.py b/regression/validate-trace-xml-schema/check.py index 2d4b55f6c95..684ed69ef8f 100644 --- a/regression/validate-trace-xml-schema/check.py +++ b/regression/validate-trace-xml-schema/check.py @@ -33,6 +33,7 @@ # these test for invalid command line handling 'bad_option/test_multiple.desc', 'bad_option/test.desc', + 'unknown-argument-suggestion/test.desc', # this one produces XML intermingled with main XML output when used with --xml-ui 'graphml_witness2/test.desc', # produces intermingled XML on the command line diff --git a/src/util/Makefile b/src/util/Makefile index 1bef87393ee..8e800b92ded 100644 --- a/src/util/Makefile +++ b/src/util/Makefile @@ -11,6 +11,7 @@ SRC = allocate_objects.cpp \ cout_message.cpp \ dstring.cpp \ endianness_map.cpp \ + edit_distance.cpp \ expr.cpp \ expr_initializer.cpp \ expr_util.cpp \ diff --git a/src/util/cmdline.cpp b/src/util/cmdline.cpp index b4a17cb2483..6fda3821ff9 100644 --- a/src/util/cmdline.cpp +++ b/src/util/cmdline.cpp @@ -8,6 +8,7 @@ Author: Daniel Kroening, kroening@kroening.com #include "cmdline.h" +#include #include #include @@ -250,6 +251,66 @@ cmdlinet::option_namest cmdlinet::option_names() const return option_namest{*this}; } +std::vector +cmdlinet::get_argument_suggestions(const std::string &unknown_argument) +{ + struct suggestiont + { + std::size_t distance; + std::string suggestion; + + bool operator<(const suggestiont &other) const + { + return distance < other.distance; + } + }; + + auto argument_suggestions = std::vector{}; + // We allow 3 errors here. This can lead to the output being a bit chatty, + // which we mitigate by reducing suggestions to those with the minimum + // distance further down below + const auto argument_matcher = levenshtein_automatont{unknown_argument, 3}; + for(const auto &option : options) + { + if(option.islong) + { + const auto long_name = "--" + option.optstring; + if(auto distance = argument_matcher.get_edit_distance(long_name)) + { + argument_suggestions.push_back({distance.value(), long_name}); + } + } + if(!option.islong) + { + const auto short_name = std::string{"-"} + option.optchar; + if(auto distance = argument_matcher.get_edit_distance(short_name)) + { + argument_suggestions.push_back({distance.value(), short_name}); + } + } + } + + auto final_suggestions = std::vector{}; + if(!argument_suggestions.empty()) + { + // we only want to keep suggestions with the minimum distance + // because otherwise they become quickly too noisy to be useful + auto min = std::min_element( + argument_suggestions.begin(), argument_suggestions.end()); + INVARIANT( + min != argument_suggestions.end(), + "there is a minimum because it's not empty"); + for(auto const &suggestion : argument_suggestions) + { + if(suggestion.distance == min->distance) + { + final_suggestions.push_back(suggestion.suggestion); + } + } + } + return final_suggestions; +} + cmdlinet::option_namest::option_names_iteratort::option_names_iteratort( const cmdlinet *command_line, std::size_t index) diff --git a/src/util/cmdline.h b/src/util/cmdline.h index 3c15900155c..127bc59cbff 100644 --- a/src/util/cmdline.h +++ b/src/util/cmdline.h @@ -94,6 +94,9 @@ class cmdlinet cmdlinet(); virtual ~cmdlinet(); + std::vector + get_argument_suggestions(const std::string &unknown_argument); + protected: struct optiont { diff --git a/src/util/edit_distance.cpp b/src/util/edit_distance.cpp new file mode 100644 index 00000000000..5c634b7873f --- /dev/null +++ b/src/util/edit_distance.cpp @@ -0,0 +1,73 @@ +/// \file +/// \author Diffblue Ltd. +/// +/// Provides a way to compute edit distance between two strings + +#include "edit_distance.h" + +levenshtein_automatont::levenshtein_automatont( + const std::string &string, + std::size_t allowed_errors) +{ + const std::size_t layer_offset = string.size() + 1; + for(std::size_t i = 0; i <= allowed_errors; ++i) + { + final_states.push_back(string.size() + layer_offset * i); + } + for(std::size_t string_index = 0; string_index < string.size(); + ++string_index) + { + for(std::size_t error_layer = 0; error_layer <= allowed_errors; + ++error_layer) + { + // position string_index matches + nfa.add_transition( + error_layer * layer_offset + string_index, + string[string_index], + error_layer * layer_offset + string_index + 1); + if(error_layer < allowed_errors) + { + // insertion, swap or deletion + nfa.add_arbitrary_transition( + error_layer * layer_offset + string_index, + (error_layer + 1) * layer_offset + string_index); + nfa.add_epsilon_transition( + error_layer * layer_offset + string_index, + (error_layer + 1) * layer_offset + string_index + 1); + nfa.add_arbitrary_transition( + error_layer * layer_offset + string_index, + (error_layer + 1) * layer_offset + string_index + 1); + } + } + } + for(std::size_t error_layer = 0; error_layer < allowed_errors; ++error_layer) + { + // arbitrary transitions between error layers + nfa.add_arbitrary_transition( + error_layer * layer_offset + string.size(), + (error_layer + 1) * layer_offset + string.size()); + } +} + +bool levenshtein_automatont::matches(const std::string &string) const +{ + return get_edit_distance(string).has_value(); +} + +optionalt +levenshtein_automatont::get_edit_distance(const std::string &string) const +{ + auto current = nfa.initial_state(0); + for(const auto c : string) + { + current = nfa.next_state(current, c); + } + for(std::size_t distance = 0; distance < final_states.size(); ++distance) + { + if(current.contains(final_states[distance])) + { + return distance; + } + } + return nullopt; +} diff --git a/src/util/edit_distance.h b/src/util/edit_distance.h new file mode 100644 index 00000000000..2d72586ec79 --- /dev/null +++ b/src/util/edit_distance.h @@ -0,0 +1,46 @@ +/// \file +/// \author Diffblue Ltd. +/// +/// Loosely based on this blog post: +/// http://blog.notdot.net/2010/07/Damn-Cool-Algorithms-Levenshtein-Automata +/// Provides a way to compute edit distance between two strings +/// +/// No conversion to DFA or other optimisations are done here because for our +/// use case (i.e. suggestions for errors in command line specifications) this +/// is fast enough without them. + +#ifndef CPROVER_UTIL_EDIT_DISTANCE_H +#define CPROVER_UTIL_EDIT_DISTANCE_H + +#include "nfa.h" + +#include +#include + +#include + +/// Simple automaton that can detect whether a string can be transformed into +/// another with a limited number of deletions, insertions or substitutions. +/// Not a very fast implementation, but should be good enough for small strings. +struct levenshtein_automatont +{ +private: + nfat nfa; + using state_labelt = nfat::state_labelt; + std::vector final_states; + +public: + levenshtein_automatont( + const std::string &string, + std::size_t allowed_errors = 2); + + bool matches(const std::string &string) const; + optionalt get_edit_distance(const std::string &string) const; + + void dump_automaton_dot_to(std::ostream &out) + { + nfa.dump_automaton_dot_to(out); + }; +}; + +#endif // CPROVER_UTIL_EDIT_DISTANCE_H diff --git a/src/util/nfa.h b/src/util/nfa.h new file mode 100644 index 00000000000..d5f52b1128a --- /dev/null +++ b/src/util/nfa.h @@ -0,0 +1,174 @@ +/// \file +/// \author Diffblue Ltd. +/// +/// A simple NFA implementation. +/// +/// This was created for use in the util/edit_distance.h functionality, which +/// in turn is used in util/cmdline.h for suggesting spelling corrections when +/// a user mistypes a command line option. Because of this the implementation +/// wasn’t done with performance in mind and is probably unsuitable as-is for +/// other purposes where performance does matter. + +#ifndef CPROVER_UTIL_NFA_H +#define CPROVER_UTIL_NFA_H + +#include +#include +#include +#include +#include +#include +#include +#include + +/// Very simple NFA implementation +/// Not super performant, but should be good enough for our purposes +template +struct nfat +{ + using state_labelt = std::size_t; + /// A state is a set of possibly active transitions. + /// The automaton itself is stateless, it just contains + /// the state transitions. + struct statet + { + private: + std::unordered_set possible_states; + + public: + friend struct nfat; + + bool contains(state_labelt state_label) const + { + return possible_states.count(state_label) != 0; + } + }; + + nfat() = default; + + /// Add a transition from "from" to "to" exactly when "when" is consumed + void add_transition(state_labelt from, T when, state_labelt to) + { + resize_if_necessary(from, to); + transitions[from].when[when].insert(to); + } + + /// Add a transition from "from" to "to" when any input is consumed + /// This is handled a special case so not all inputs need to be enumerated + /// for arbitrary transitions + void add_arbitrary_transition(state_labelt from, state_labelt to) + { + resize_if_necessary(from, to); + transitions[from].arbitrary.insert(to); + } + + /// Add a transition from "from" to "to" that doesn’t consume input + void add_epsilon_transition(state_labelt from, state_labelt to) + { + resize_if_necessary(from, to); + transitions[from].epsilon.insert(to); + } + + statet initial_state(state_labelt initial) const + { + statet result{}; + result.possible_states.insert(initial); + follow_epsilon_transitions(result); + return result; + } + + statet next_state(const statet ¤t, const T &input) const + { + statet new_state{}; + for(const auto label : current.possible_states) + { + const auto &transition = transitions[label]; + const auto it = transition.when.find(input); + if(it != transition.when.end()) + { + for(const auto result_state : it->second) + { + new_state.possible_states.insert(result_state); + } + } + for(const auto result_state : transition.arbitrary) + { + new_state.possible_states.insert(result_state); + } + } + follow_epsilon_transitions(new_state); + return new_state; + } + + /// Write the automaton structure to out in graphviz dot format. + /// Meant for debugging. + void dump_automaton_dot_to(std::ostream &out) const + { + out << "digraph {\n"; + for(state_labelt from = 0; from < transitions.size(); ++from) + { + for(const auto to : transitions[from].arbitrary) + { + out << 'S' << from << " -> S" << to << "[label=\"*\"]\n"; + } + for(const auto to : transitions[from].epsilon) + { + out << 'S' << from << " -> S" << to << u8"[label=\"ε\"]\n"; + } + for(const auto &pair : transitions[from].when) + { + const auto &in = pair.first; + const auto &tos = pair.second; + for(const auto to : tos) + { + out << 'S' << from << " -> S" << to << "[label=\"(" << in << ")\"]\n"; + } + } + } + out << "}\n"; + } + +private: + void resize_if_necessary(state_labelt from, state_labelt to) + { + const auto min_size = std::max(from, to) + 1; + if(transitions.size() < min_size) + { + transitions.resize(min_size); + } + } + + void follow_epsilon_transitions(statet &state) const + { + std::vector new_states{}; + do + { + new_states.clear(); + for(const auto from : state.possible_states) + { + for(const auto to : transitions[from].epsilon) + { + if(!state.contains(to)) + { + new_states.push_back(to); + } + } + } + std::copy( + new_states.begin(), + new_states.end(), + std::inserter(state.possible_states, state.possible_states.end())); + } while(!new_states.empty()); + } + + struct transitiont + { + std::unordered_set epsilon; + std::unordered_set arbitrary; + std::unordered_map> when; + }; + + std::vector transitions; +}; + +#endif // CPROVER_UTIL_NFA_H diff --git a/src/util/parse_options.cpp b/src/util/parse_options.cpp index 4e083e59bb5..56268628545 100644 --- a/src/util/parse_options.cpp +++ b/src/util/parse_options.cpp @@ -55,7 +55,22 @@ void parse_options_baset::usage_error() void parse_options_baset::unknown_option_msg() { if(!cmdline.unknown_arg.empty()) - log.error() << "Unknown option: " << cmdline.unknown_arg << messaget::eom; + { + log.error() << "Unknown option: " << cmdline.unknown_arg; + auto const suggestions = + cmdline.get_argument_suggestions(cmdline.unknown_arg); + if(!suggestions.empty()) + { + log.error() << ", did you mean "; + if(suggestions.size() > 1) + { + log.error() << "one of "; + } + join_strings(log.error(), suggestions.begin(), suggestions.end(), ", "); + log.error() << "?"; + } + log.error() << messaget::eom; + } } int parse_options_baset::main() diff --git a/unit/Makefile b/unit/Makefile index 246c090bca7..7bcc32b299c 100644 --- a/unit/Makefile +++ b/unit/Makefile @@ -73,6 +73,7 @@ SRC += analyses/ai/ai.cpp \ util/allocate_objects.cpp \ util/cmdline.cpp \ util/dense_integer_map.cpp \ + util/edit_distance.cpp \ util/expr_cast/expr_cast.cpp \ util/expr.cpp \ util/expr_iterator.cpp \ diff --git a/unit/util/edit_distance.cpp b/unit/util/edit_distance.cpp new file mode 100644 index 00000000000..f24ee818517 --- /dev/null +++ b/unit/util/edit_distance.cpp @@ -0,0 +1,139 @@ +/// \file +/// \author Diffblue Ltd. +/// Unit tests for checking if two strings are within a certain edit distance + +#include +#include + +TEST_CASE("edit distance 0", "[core][util][edit_distance]") +{ + auto const hello = levenshtein_automatont{"hello", 0}; + + // Distance 0 + REQUIRE(hello.matches("hello")); + REQUIRE(hello.get_edit_distance("hello").value() == 0); + + // Distance 1 + REQUIRE_FALSE(hello.matches("hallo")); + REQUIRE_FALSE(hello.matches("hell")); + REQUIRE_FALSE(hello.matches("helloo")); + REQUIRE_FALSE(hello.matches("chello")); + + // Distance 2 + REQUIRE_FALSE(hello.matches("helol")); + REQUIRE_FALSE(hello.matches("help")); + REQUIRE_FALSE(hello.matches("yohello")); + + // Distance 3 + REQUIRE_FALSE(hello.matches("kelp")); + REQUIRE_FALSE(hello.matches("hilt")); + REQUIRE_FALSE(hello.matches("wallow")); + + // Distance > 3 + REQUIRE_FALSE(hello.matches("unrelated")); +} + +TEST_CASE("edit distance 1", "[core][util][edit_distance]") +{ + auto const hello = levenshtein_automatont{"hello", 1}; + + // Distance 0 + REQUIRE(hello.matches("hello")); + REQUIRE(hello.get_edit_distance("hello").value() == 0); + + // Distance 1 + REQUIRE(hello.matches("hallo")); + REQUIRE(hello.get_edit_distance("hallo").value() == 1); + REQUIRE(hello.matches("hell")); + REQUIRE(hello.get_edit_distance("hell").value() == 1); + REQUIRE(hello.matches("helloo")); + REQUIRE(hello.get_edit_distance("helloo").value() == 1); + REQUIRE(hello.matches("chello")); + REQUIRE(hello.get_edit_distance("chello").value() == 1); + + // Distance 2 + REQUIRE_FALSE(hello.matches("helol")); + REQUIRE_FALSE(hello.matches("help")); + REQUIRE_FALSE(hello.matches("yohello")); + + // Distance 3 + REQUIRE_FALSE(hello.matches("kelp")); + REQUIRE_FALSE(hello.matches("hilt")); + REQUIRE_FALSE(hello.matches("wallow")); + + // Distance > 3 + REQUIRE_FALSE(hello.matches("unrelated")); +} + +TEST_CASE("edit distance 2", "[core][util][edit_distance]") +{ + auto const hello = levenshtein_automatont{"hello", 2}; + + // Distance 0 + REQUIRE(hello.matches("hello")); + REQUIRE(hello.get_edit_distance("hello").value() == 0); + + // Distance 1 + REQUIRE(hello.matches("hallo")); + REQUIRE(hello.get_edit_distance("hallo").value() == 1); + REQUIRE(hello.matches("hell")); + REQUIRE(hello.get_edit_distance("hell").value() == 1); + REQUIRE(hello.matches("helloo")); + REQUIRE(hello.get_edit_distance("helloo").value() == 1); + REQUIRE(hello.matches("chello")); + REQUIRE(hello.get_edit_distance("chello").value() == 1); + + // Distance 2 + REQUIRE(hello.matches("helol")); + REQUIRE(hello.get_edit_distance("helol").value() == 2); + REQUIRE(hello.matches("help")); + REQUIRE(hello.get_edit_distance("help").value() == 2); + REQUIRE(hello.matches("yohello")); + REQUIRE(hello.get_edit_distance("yohello").value() == 2); + + // Distance 3 + REQUIRE_FALSE(hello.matches("kelp")); + REQUIRE_FALSE(hello.matches("hilt")); + REQUIRE_FALSE(hello.matches("wallow")); + + // Distance > 3 + REQUIRE_FALSE(hello.matches("unrelated")); +} + +TEST_CASE("edit distance 3", "[core][util][edit_distance]") +{ + auto const hello = levenshtein_automatont{"hello", 3}; + + // Distance 0 + REQUIRE(hello.matches("hello")); + REQUIRE(hello.get_edit_distance("hello").value() == 0); + + // Distance 1 + REQUIRE(hello.matches("hallo")); + REQUIRE(hello.get_edit_distance("hallo").value() == 1); + REQUIRE(hello.matches("hell")); + REQUIRE(hello.get_edit_distance("hell").value() == 1); + REQUIRE(hello.matches("helloo")); + REQUIRE(hello.get_edit_distance("helloo").value() == 1); + REQUIRE(hello.matches("chello")); + REQUIRE(hello.get_edit_distance("chello").value() == 1); + + // Distance 2 + REQUIRE(hello.matches("helol")); + REQUIRE(hello.get_edit_distance("helol").value() == 2); + REQUIRE(hello.matches("help")); + REQUIRE(hello.get_edit_distance("help").value() == 2); + REQUIRE(hello.matches("yohello")); + REQUIRE(hello.get_edit_distance("yohello").value() == 2); + + // Distance 3 + REQUIRE(hello.matches("kelp")); + REQUIRE(hello.get_edit_distance("kelp").value() == 3); + REQUIRE(hello.matches("hilt")); + REQUIRE(hello.get_edit_distance("hilt").value() == 3); + REQUIRE(hello.matches("wallow")); + REQUIRE(hello.get_edit_distance("wallow").value() == 3); + + // Distance > 3 + REQUIRE_FALSE(hello.matches("unrelated")); +}