diff --git a/clang-tools-extra/pseudo/CMakeLists.txt b/clang-tools-extra/pseudo/CMakeLists.txt index 24bc1530bb7d6..2bc0f92d063cc 100644 --- a/clang-tools-extra/pseudo/CMakeLists.txt +++ b/clang-tools-extra/pseudo/CMakeLists.txt @@ -1,11 +1,5 @@ include_directories(include) -include_directories(${CMAKE_CURRENT_BINARY_DIR}/include) -add_subdirectory(include) -add_subdirectory(gen) add_subdirectory(lib) -add_subdirectory(tool) -add_subdirectory(fuzzer) -add_subdirectory(benchmarks) if(CLANG_INCLUDE_TESTS) add_subdirectory(unittests) add_subdirectory(test) diff --git a/clang-tools-extra/pseudo/README.md b/clang-tools-extra/pseudo/README.md index 0958f5d500e7f..b5984fdcdc097 100644 --- a/clang-tools-extra/pseudo/README.md +++ b/clang-tools-extra/pseudo/README.md @@ -1,3 +1,10 @@ +# Removed + +This was never completed and most of the implementation has been removed. +This document remains for historical interest, for now. + +See https://docs.google.com/document/d/1eGkTOsFja63wsv8v0vd5JdoTonj-NlN3ujGF0T7xDbM/edit + # clang pseudoparser This directory implements an approximate heuristic parser for C++, based on the diff --git a/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp b/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp deleted file mode 100644 index 087ab6c250e39..0000000000000 --- a/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp +++ /dev/null @@ -1,156 +0,0 @@ -//===--- Benchmark.cpp - clang pseudoparser benchmarks ---------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Benchmark for the overall pseudoparser performance, it also includes other -// important pieces of the pseudoparser (grammar compliation, LR table build -// etc). -// -// Note: make sure to build the benchmark in Release mode. -// -// Usage: -// tools/clang/tools/extra/pseudo/benchmarks/ClangPseudoBenchmark \ -// --grammar=../clang-tools-extra/pseudo/lib/cxx.bnf \ -// --source=../clang/lib/Sema/SemaDecl.cpp -// -//===----------------------------------------------------------------------===// - -#include "benchmark/benchmark.h" -#include "clang-pseudo/Bracket.h" -#include "clang-pseudo/DirectiveTree.h" -#include "clang-pseudo/Forest.h" -#include "clang-pseudo/GLR.h" -#include "clang-pseudo/Token.h" -#include "clang-pseudo/cli/CLI.h" -#include "clang-pseudo/grammar/Grammar.h" -#include "clang-pseudo/grammar/LRTable.h" -#include "clang/Basic/LangOptions.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/ErrorOr.h" -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/raw_ostream.h" -#include - -using llvm::cl::desc; -using llvm::cl::opt; -using llvm::cl::Required; - -static opt Source("source", desc("Source file"), Required); - -namespace clang { -namespace pseudo { -namespace bench { -namespace { - -const std::string *SourceText = nullptr; -const Language *Lang = nullptr; - -void setup() { - auto ReadFile = [](llvm::StringRef FilePath) -> std::string { - llvm::ErrorOr> GrammarText = - llvm::MemoryBuffer::getFile(FilePath); - if (std::error_code EC = GrammarText.getError()) { - llvm::errs() << "Error: can't read file '" << FilePath - << "': " << EC.message() << "\n"; - std::exit(1); - } - return GrammarText.get()->getBuffer().str(); - }; - SourceText = new std::string(ReadFile(Source)); - Lang = &getLanguageFromFlags(); -} - -static void buildSLR(benchmark::State &State) { - for (auto _ : State) - LRTable::buildSLR(Lang->G); -} -BENCHMARK(buildSLR); - -TokenStream lexAndPreprocess() { - clang::LangOptions LangOpts = genericLangOpts(); - TokenStream RawStream = pseudo::lex(*SourceText, LangOpts); - auto DirectiveStructure = DirectiveTree::parse(RawStream); - chooseConditionalBranches(DirectiveStructure, RawStream); - TokenStream Cook = - cook(DirectiveStructure.stripDirectives(RawStream), LangOpts); - auto Stream = stripComments(Cook); - pairBrackets(Stream); - return Stream; -} - -static void lex(benchmark::State &State) { - clang::LangOptions LangOpts = genericLangOpts(); - for (auto _ : State) - clang::pseudo::lex(*SourceText, LangOpts); - State.SetBytesProcessed(static_cast(State.iterations()) * - SourceText->size()); -} -BENCHMARK(lex); - -static void pairBrackets(benchmark::State &State) { - clang::LangOptions LangOpts = genericLangOpts(); - auto Stream = clang::pseudo::lex(*SourceText, LangOpts); - for (auto _ : State) - pairBrackets(Stream); - State.SetBytesProcessed(static_cast(State.iterations()) * - SourceText->size()); -} -BENCHMARK(pairBrackets); - -static void preprocess(benchmark::State &State) { - clang::LangOptions LangOpts = genericLangOpts(); - TokenStream RawStream = clang::pseudo::lex(*SourceText, LangOpts); - for (auto _ : State) { - auto DirectiveStructure = DirectiveTree::parse(RawStream); - chooseConditionalBranches(DirectiveStructure, RawStream); - stripComments( - cook(DirectiveStructure.stripDirectives(RawStream), LangOpts)); - } - State.SetBytesProcessed(static_cast(State.iterations()) * - SourceText->size()); -} -BENCHMARK(preprocess); - -static void glrParse(benchmark::State &State) { - SymbolID StartSymbol = *Lang->G.findNonterminal("translation-unit"); - TokenStream Stream = lexAndPreprocess(); - for (auto _ : State) { - pseudo::ForestArena Forest; - pseudo::GSS GSS; - pseudo::glrParse(ParseParams{Stream, Forest, GSS}, StartSymbol, *Lang); - } - State.SetBytesProcessed(static_cast(State.iterations()) * - SourceText->size()); -} -BENCHMARK(glrParse); - -static void full(benchmark::State &State) { - SymbolID StartSymbol = *Lang->G.findNonterminal("translation-unit"); - for (auto _ : State) { - TokenStream Stream = lexAndPreprocess(); - pseudo::ForestArena Forest; - pseudo::GSS GSS; - pseudo::glrParse(ParseParams{Stream, Forest, GSS}, StartSymbol, *Lang); - } - State.SetBytesProcessed(static_cast(State.iterations()) * - SourceText->size()); -} -BENCHMARK(full); - -} // namespace -} // namespace bench -} // namespace pseudo -} // namespace clang - -int main(int argc, char *argv[]) { - benchmark::Initialize(&argc, argv); - llvm::cl::ParseCommandLineOptions(argc, argv); - clang::pseudo::bench::setup(); - benchmark::RunSpecifiedBenchmarks(); - return 0; -} diff --git a/clang-tools-extra/pseudo/benchmarks/CMakeLists.txt b/clang-tools-extra/pseudo/benchmarks/CMakeLists.txt deleted file mode 100644 index 859db991403cd..0000000000000 --- a/clang-tools-extra/pseudo/benchmarks/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -add_benchmark(ClangPseudoBenchmark Benchmark.cpp) - -target_link_libraries(ClangPseudoBenchmark - PRIVATE - clangPseudo - clangPseudoCLI - clangPseudoGrammar - LLVMSupport - ) diff --git a/clang-tools-extra/pseudo/fuzzer/CMakeLists.txt b/clang-tools-extra/pseudo/fuzzer/CMakeLists.txt deleted file mode 100644 index e1d79873471f0..0000000000000 --- a/clang-tools-extra/pseudo/fuzzer/CMakeLists.txt +++ /dev/null @@ -1,16 +0,0 @@ -set(LLVM_LINK_COMPONENTS - FuzzerCLI - Support - ) - -add_llvm_fuzzer(clang-pseudo-fuzzer - Fuzzer.cpp - DUMMY_MAIN Main.cpp - ) - -target_link_libraries(clang-pseudo-fuzzer - PRIVATE - clangPseudo - clangPseudoCLI - clangPseudoGrammar - ) diff --git a/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp b/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp deleted file mode 100644 index 87b9d15480cc3..0000000000000 --- a/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp +++ /dev/null @@ -1,82 +0,0 @@ -//===-- Fuzzer.cpp - Fuzz the pseudoparser --------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "clang-pseudo/DirectiveTree.h" -#include "clang-pseudo/Forest.h" -#include "clang-pseudo/GLR.h" -#include "clang-pseudo/Token.h" -#include "clang-pseudo/cli/CLI.h" -#include "clang-pseudo/grammar/Grammar.h" -#include "clang-pseudo/grammar/LRTable.h" -#include "clang/Basic/LangOptions.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/raw_ostream.h" -#include - -namespace clang { -namespace pseudo { -namespace { - -class Fuzzer { - clang::LangOptions LangOpts = clang::pseudo::genericLangOpts(); - bool Print; - -public: - Fuzzer(bool Print) : Print(Print) {} - - void operator()(llvm::StringRef Code) { - std::string CodeStr = Code.str(); // Must be null-terminated. - auto RawStream = lex(CodeStr, LangOpts); - auto DirectiveStructure = DirectiveTree::parse(RawStream); - clang::pseudo::chooseConditionalBranches(DirectiveStructure, RawStream); - // FIXME: strip preprocessor directives - auto ParseableStream = - clang::pseudo::stripComments(cook(RawStream, LangOpts)); - - clang::pseudo::ForestArena Arena; - clang::pseudo::GSS GSS; - const Language &Lang = getLanguageFromFlags(); - auto &Root = - glrParse(clang::pseudo::ParseParams{ParseableStream, Arena, GSS}, - *Lang.G.findNonterminal("translation-unit"), Lang); - if (Print) - llvm::outs() << Root.dumpRecursive(Lang.G); - } -}; - -Fuzzer *Fuzz = nullptr; - -} // namespace -} // namespace pseudo -} // namespace clang - -extern "C" { - -// Set up the fuzzer from command line flags: -// -print - used for testing the fuzzer -int LLVMFuzzerInitialize(int *Argc, char ***Argv) { - bool PrintForest = false; - auto ConsumeArg = [&](llvm::StringRef Arg) -> bool { - if (Arg == "-print") { - PrintForest = true; - return true; - } - return false; - }; - *Argc = std::remove_if(*Argv + 1, *Argv + *Argc, ConsumeArg) - *Argv; - - clang::pseudo::Fuzz = new clang::pseudo::Fuzzer(PrintForest); - return 0; -} - -int LLVMFuzzerTestOneInput(uint8_t *Data, size_t Size) { - (*clang::pseudo::Fuzz)(llvm::StringRef(reinterpret_cast(Data), Size)); - return 0; -} -} diff --git a/clang-tools-extra/pseudo/fuzzer/Main.cpp b/clang-tools-extra/pseudo/fuzzer/Main.cpp deleted file mode 100644 index 542a3007a399f..0000000000000 --- a/clang-tools-extra/pseudo/fuzzer/Main.cpp +++ /dev/null @@ -1,16 +0,0 @@ -//===--- Main.cpp - Entry point to sanity check the fuzzer ----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/FuzzMutate/FuzzerCLI.h" - -extern "C" int LLVMFuzzerInitialize(int *, char ***); -extern "C" int LLVMFuzzerTestOneInput(const uint8_t *, size_t); -int main(int argc, char *argv[]) { - return llvm::runFuzzerOnInputs(argc, argv, LLVMFuzzerTestOneInput, - LLVMFuzzerInitialize); -} diff --git a/clang-tools-extra/pseudo/gen/CMakeLists.txt b/clang-tools-extra/pseudo/gen/CMakeLists.txt deleted file mode 100644 index 3dd615a558751..0000000000000 --- a/clang-tools-extra/pseudo/gen/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -set(LLVM_LINK_COMPONENTS Support) -list(REMOVE_ITEM LLVM_COMMON_DEPENDS clang-tablegen-targets) - -add_clang_executable(clang-pseudo-gen - Main.cpp - ) - -target_link_libraries(clang-pseudo-gen - PRIVATE - clangPseudoGrammar - ) diff --git a/clang-tools-extra/pseudo/gen/Main.cpp b/clang-tools-extra/pseudo/gen/Main.cpp deleted file mode 100644 index 25cb26563837a..0000000000000 --- a/clang-tools-extra/pseudo/gen/Main.cpp +++ /dev/null @@ -1,172 +0,0 @@ -//===--- Main.cpp - Compile BNF grammar -----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This is a tool to compile a BNF grammar, it is used by the build system to -// generate a necessary data bits to statically construct core pieces (Grammar, -// LRTable etc) of the LR parser. -// -//===----------------------------------------------------------------------===// - -#include "clang-pseudo/grammar/Grammar.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/ToolOutputFile.h" -#include - -using llvm::cl::desc; -using llvm::cl::init; -using llvm::cl::opt; -using llvm::cl::Required; -using llvm::cl::value_desc; -using llvm::cl::values; - -namespace { -enum EmitType { - EmitSymbolList, - EmitGrammarContent, -}; - -opt Grammar("grammar", desc("Parse a BNF grammar file."), - Required); -opt - Emit(desc("which information to emit:"), - values(clEnumValN(EmitSymbolList, "emit-symbol-list", - "Print nonterminal symbols (default)"), - clEnumValN(EmitGrammarContent, "emit-grammar-content", - "Print the BNF grammar content as a string"))); - -opt OutputFilename("o", init("-"), desc("Output"), - value_desc("file")); - -std::string readOrDie(llvm::StringRef Path) { - llvm::ErrorOr> Text = - llvm::MemoryBuffer::getFile(Path); - if (std::error_code EC = Text.getError()) { - llvm::errs() << "Error: can't read grammar file '" << Path - << "': " << EC.message() << "\n"; - ::exit(1); - } - return Text.get()->getBuffer().str(); -} -} // namespace - -namespace clang { -namespace pseudo { -namespace { - -// Mangles a symbol name into a valid identifier. -// -// These follow names in the grammar fairly closely: -// nonterminal: `ptr-declarator` becomes `ptr_declarator`; -// punctuator: `,` becomes `COMMA`; -// keyword: `INT` becomes `INT`; -// terminal: `IDENTIFIER` becomes `IDENTIFIER`; -std::string mangleSymbol(SymbolID SID, const Grammar &G) { - static auto &TokNames = *new std::vector{ -#define TOK(X) llvm::StringRef(#X).upper(), -#define KEYWORD(Keyword, Condition) llvm::StringRef(#Keyword).upper(), -#include "clang/Basic/TokenKinds.def" - }; - if (isToken(SID)) - return TokNames[symbolToToken(SID)]; - std::string Name = G.symbolName(SID).str(); - // translation-unit -> translation_unit - std::replace(Name.begin(), Name.end(), '-', '_'); - return Name; -} - -// Mangles the RHS of a rule definition into a valid identifier. -// -// These are unique only for a fixed LHS. -// e.g. for the grammar rule `ptr-declarator := ptr-operator ptr-declarator`, -// it is `ptr_operator__ptr_declarator`. -std::string mangleRule(RuleID RID, const Grammar &G) { - const auto &R = G.lookupRule(RID); - std::string MangleName = mangleSymbol(R.seq().front(), G); - for (SymbolID S : R.seq().drop_front()) { - MangleName.append("__"); - MangleName.append(mangleSymbol(S, G)); - } - return MangleName; -} - -} // namespace -} // namespace pseudo -} // namespace clang - -int main(int argc, char *argv[]) { - llvm::cl::ParseCommandLineOptions(argc, argv, ""); - - std::string GrammarText = readOrDie(Grammar); - std::vector Diags; - auto G = clang::pseudo::Grammar::parseBNF(GrammarText, Diags); - - if (!Diags.empty()) { - llvm::errs() << llvm::join(Diags, "\n"); - return 1; - } - - std::error_code EC; - llvm::ToolOutputFile Out{OutputFilename, EC, llvm::sys::fs::OF_None}; - if (EC) { - llvm::errs() << EC.message() << '\n'; - return 1; - } - - switch (Emit) { - case EmitSymbolList: - Out.os() << R"cpp( -#ifndef NONTERMINAL -#define NONTERMINAL(NAME, ID) -#endif -#ifndef RULE -#define RULE(LHS, RHS, ID) -#endif -#ifndef EXTENSION -#define EXTENSION(NAME, ID) -#endif -)cpp"; - for (clang::pseudo::SymbolID ID = 0; ID < G.table().Nonterminals.size(); - ++ID) { - Out.os() << llvm::formatv("NONTERMINAL({0}, {1})\n", - clang::pseudo::mangleSymbol(ID, G), ID); - for (const clang::pseudo::Rule &R : G.rulesFor(ID)) { - clang::pseudo::RuleID RID = &R - G.table().Rules.data(); - Out.os() << llvm::formatv("RULE({0}, {1}, {2})\n", - clang::pseudo::mangleSymbol(R.Target, G), - clang::pseudo::mangleRule(RID, G), RID); - } - } - for (clang::pseudo::ExtensionID EID = 1 /*skip the sentinel 0 value*/; - EID < G.table().AttributeValues.size(); ++EID) { - llvm::StringRef Name = G.table().AttributeValues[EID]; - assert(!Name.empty()); - Out.os() << llvm::formatv("EXTENSION({0}, {1})\n", Name, EID); - } - Out.os() << R"cpp( -#undef NONTERMINAL -#undef RULE -#undef EXTENSION -)cpp"; - break; - case EmitGrammarContent: - for (llvm::StringRef Line : llvm::split(GrammarText, '\n')) { - Out.os() << '"'; - Out.os().write_escaped((Line + "\n").str()); - Out.os() << "\"\n"; - } - break; - } - - Out.keep(); - - return 0; -} diff --git a/clang-tools-extra/pseudo/include/CMakeLists.txt b/clang-tools-extra/pseudo/include/CMakeLists.txt deleted file mode 100644 index 2334cfa12e337..0000000000000 --- a/clang-tools-extra/pseudo/include/CMakeLists.txt +++ /dev/null @@ -1,31 +0,0 @@ -# The cxx.bnf grammar file -set(cxx_bnf ${CMAKE_CURRENT_SOURCE_DIR}/../lib/cxx/cxx.bnf) - -setup_host_tool(clang-pseudo-gen CLANG_PSEUDO_GEN pseudo_gen pseudo_gen_target) - -# Generate inc files. -set(cxx_symbols_inc ${CMAKE_CURRENT_BINARY_DIR}/CXXSymbols.inc) -add_custom_command(OUTPUT ${cxx_symbols_inc} - COMMAND "${pseudo_gen}" - --grammar ${cxx_bnf} - --emit-symbol-list - -o ${cxx_symbols_inc} - COMMENT "Generating nonterminal symbol file for cxx grammar..." - DEPENDS ${pseudo_gen_target} ${cxx_bnf} - VERBATIM) - -set(cxx_bnf_inc ${CMAKE_CURRENT_BINARY_DIR}/CXXBNF.inc) -add_custom_command(OUTPUT ${cxx_bnf_inc} - COMMAND "${pseudo_gen}" - --grammar ${cxx_bnf} - --emit-grammar-content - -o ${cxx_bnf_inc} - COMMENT "Generating bnf string file for cxx grammar..." - DEPENDS ${pseudo_gen_target} ${cxx_bnf} - VERBATIM) - -# add_custom_command does not create a new target, we need to deine a target -# explicitly, so that other targets can depend on it. -add_custom_target(cxx_gen - DEPENDS ${cxx_symbols_inc} ${cxx_bnf_inc} - VERBATIM) diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Disambiguate.h b/clang-tools-extra/pseudo/include/clang-pseudo/Disambiguate.h deleted file mode 100644 index 5f3a22c9cabb3..0000000000000 --- a/clang-tools-extra/pseudo/include/clang-pseudo/Disambiguate.h +++ /dev/null @@ -1,64 +0,0 @@ -//===--- Disambiguate.h - Find the best tree in the forest -------*- C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// A GLR parse forest represents every possible parse tree for the source code. -// -// Before we can do useful analysis/editing of the code, we need to pick a -// single tree which we think is accurate. We use three main types of clues: -// -// A) Semantic language rules may restrict which parses are allowed. -// For example, `string string string X` is *grammatical* C++, but only a -// single type-name is allowed in a decl-specifier-sequence. -// Where possible, these interpretations are forbidden by guards. -// Sometimes this isn't possible, or we want our parser to be lenient. -// -// B) Some constructs are rarer, while others are common. -// For example `a::c` is often a template specialization, and rarely a -// double comparison between a, b, and c. -// -// C) Identifier text hints whether they name types/values/templates etc. -// "std" is usually a namespace, a project index may also guide us. -// Hints may be within the document: if one occurrence of 'foo' is a variable -// then the others probably are too. -// (Text need not match: similar CaseStyle can be a weak hint, too). -// -//----------------------------------------------------------------------------// -// -// Mechanically, we replace each ambiguous node with its best alternative. -// -// "Best" is determined by assigning bonuses/penalties to nodes, to express -// the clues of type A and B above. A forest node representing an unlikely -// parse would apply a penalty to every subtree is is present in. -// Disambiguation proceeds bottom-up, so that the score of each alternative -// is known when a decision is made. -// -// Identifier-based hints within the document mean some nodes should be -// *correlated*. Rather than resolve these simultaneously, we make the most -// certain decisions first and use these results to update bonuses elsewhere. -// -//===----------------------------------------------------------------------===// - -#include "clang-pseudo/Forest.h" - -namespace clang::pseudo { - -struct DisambiguateParams {}; - -// Maps ambiguous nodes onto the index of their preferred alternative. -using Disambiguation = llvm::DenseMap; - -// Resolve each ambiguous node in the forest. -// Maps each ambiguous node to the index of the chosen alternative. -// FIXME: current implementation is a placeholder and chooses arbitrarily. -Disambiguation disambiguate(const ForestNode *Root, - const DisambiguateParams &Params); - -// Remove all ambiguities from the forest, resolving them according to Disambig. -void removeAmbiguities(ForestNode *&Root, const Disambiguation &Disambig); - -} // namespace clang::pseudo diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h deleted file mode 100644 index e9edb40e02b64..0000000000000 --- a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h +++ /dev/null @@ -1,236 +0,0 @@ -//===--- Forest.h - Parse forest, the output of the GLR parser ---*- C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// A parse forest represents a set of possible parse trees efficiently, it is -// produced by the GLR parser. -// -// Despite the name, its data structure is a tree-like DAG with a single root. -// Multiple ways to parse the same tokens are presented as an ambiguous node -// with all possible interpretations as children. -// Common sub-parses are shared: if two interpretations both parse "1 + 1" as -// "expr := expr + expr", they will share a Sequence node representing the expr. -// -//===----------------------------------------------------------------------===// - -#ifndef CLANG_PSEUDO_FOREST_H -#define CLANG_PSEUDO_FOREST_H - -#include "clang-pseudo/Token.h" -#include "clang-pseudo/grammar/Grammar.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/Support/Allocator.h" -#include - -namespace clang { -namespace pseudo { - -// A node represents ways to parse a sequence of tokens, it interprets a fixed -// range of tokens as a fixed grammar symbol. -// -// There are different kinds of nodes, some nodes have "children" (stored in a -// trailing array) and have pointers to them. "Children" has different semantics -// depending on the node kinds. For an Ambiguous node, it means all -// possible interpretations; for a Sequence node, it means each symbol on the -// right hand side of the production rule. -// -// Since this is a node in a DAG, a node may have multiple parents. And a node -// doesn't have parent pointers. -class alignas(class ForestNode *) ForestNode { -public: - class RecursiveIterator; - enum Kind { - // A Terminal node is a single terminal symbol bound to a token. - Terminal, - // A Sequence node is a nonterminal symbol parsed from a grammar rule, - // elements() are the parses of each symbol on the RHS of the rule. - // If the rule is A := X Y Z, the node is for nonterminal A, and elements() - // are [X, Y, Z]. - Sequence, - // An Ambiguous node exposes multiple ways to interpret the code as the - // same symbol, alternatives() are all possible parses. - Ambiguous, - // An Opaque node is a placeholder. It asserts that tokens match a symbol, - // without saying how. - // It is used for lazy-parsing (not parsed yet), or error-recovery (invalid - // code). - Opaque, - }; - Kind kind() const { return K; } - - SymbolID symbol() const { return Symbol; } - - // The start of the token range, it is a poistion within a token stream. - Token::Index startTokenIndex() const { return StartIndex; } - - // Returns the corresponding grammar rule. - // REQUIRES: this is a Sequence node. - RuleID rule() const { - assert(kind() == Sequence); - return Data & ((1 << RuleBits) - 1); - } - // Returns the parses of each element on the RHS of the rule. - // REQUIRES: this is a Sequence node; - llvm::ArrayRef elements() const { - assert(kind() == Sequence); - return children(Data >> RuleBits); - } - llvm::MutableArrayRef elements() { - assert(kind() == Sequence); - return children(Data >> RuleBits); - } - - // Returns all possible interpretations of the code. - // REQUIRES: this is an Ambiguous node. - llvm::ArrayRef alternatives() const { - assert(kind() == Ambiguous); - return children(Data); - } - llvm::MutableArrayRef alternatives() { - assert(kind() == Ambiguous); - return children(Data); - } - - llvm::ArrayRef children() const { - switch (kind()) { - case Sequence: - return elements(); - case Ambiguous: - return alternatives(); - case Terminal: - case Opaque: - return {}; - } - llvm_unreachable("Bad kind"); - } - - // Iteration over all nodes in the forest, including this. - llvm::iterator_range descendants() const; - - std::string dump(const Grammar &) const; - std::string dumpRecursive(const Grammar &, bool Abbreviated = false) const; - -private: - friend class ForestArena; - - ForestNode(Kind K, SymbolID Symbol, Token::Index StartIndex, uint16_t Data) - : StartIndex(StartIndex), K(K), Symbol(Symbol), Data(Data) {} - - ForestNode(const ForestNode &) = delete; - ForestNode &operator=(const ForestNode &) = delete; - ForestNode(ForestNode &&) = delete; - ForestNode &operator=(ForestNode &&) = delete; - - static uint16_t sequenceData(RuleID Rule, - llvm::ArrayRef Elements) { - assert(Rule < (1 << RuleBits)); - assert(Elements.size() < (1 << (16 - RuleBits))); - return Rule | Elements.size() << RuleBits; - } - static uint16_t - ambiguousData(llvm::ArrayRef Alternatives) { - return Alternatives.size(); - } - - // Retrieves the trailing array. - llvm::ArrayRef children(uint16_t Num) const { - return llvm::ArrayRef(reinterpret_cast(this + 1), Num); - } - llvm::MutableArrayRef children(uint16_t Num) { - return llvm::MutableArrayRef(reinterpret_cast(this + 1), - Num); - } - - Token::Index StartIndex; - Kind K : 4; - SymbolID Symbol : SymbolBits; - // Sequence - child count : 4 | RuleID : RuleBits (12) - // Ambiguous - child count : 16 - // Terminal, Opaque - unused - uint16_t Data; - // An array of ForestNode* following the object. -}; -// ForestNode may not be destroyed (for BumpPtrAllocator). -static_assert(std::is_trivially_destructible()); - -// A memory arena for the parse forest. -class ForestArena { -public: - llvm::ArrayRef createTerminals(const TokenStream &Code); - ForestNode &createSequence(SymbolID SID, RuleID RID, - llvm::ArrayRef Elements) { - assert(!Elements.empty()); - return create(ForestNode::Sequence, SID, - Elements.front()->startTokenIndex(), - ForestNode::sequenceData(RID, Elements), Elements); - } - ForestNode &createAmbiguous(SymbolID SID, - llvm::ArrayRef Alternatives) { - assert(!Alternatives.empty()); - assert(llvm::all_of(Alternatives, - [SID](const ForestNode *Alternative) { - return SID == Alternative->symbol(); - }) && - "Ambiguous alternatives must represent the same symbol!"); - return create(ForestNode::Ambiguous, SID, - Alternatives.front()->startTokenIndex(), - ForestNode::ambiguousData(Alternatives), Alternatives); - } - ForestNode &createOpaque(SymbolID SID, Token::Index Start) { - return create(ForestNode::Opaque, SID, Start, 0, {}); - } - - ForestNode &createTerminal(tok::TokenKind TK, Token::Index Start) { - return create(ForestNode::Terminal, tokenSymbol(TK), Start, 0, {}); - } - - size_t nodeCount() const { return NodeCount; } - size_t bytes() const { return Arena.getBytesAllocated() + sizeof(*this); } - -private: - ForestNode &create(ForestNode::Kind K, SymbolID SID, Token::Index Start, - uint16_t Data, - llvm::ArrayRef Elements) { - ++NodeCount; - ForestNode *New = new (Arena.Allocate( - sizeof(ForestNode) + Elements.size() * sizeof(ForestNode *), - alignof(ForestNode))) ForestNode(K, SID, Start, Data); - if (!Elements.empty()) - llvm::copy(Elements, reinterpret_cast(New + 1)); - return *New; - } - - llvm::BumpPtrAllocator Arena; - uint32_t NodeCount = 0; -}; - -class ForestNode::RecursiveIterator - : public llvm::iterator_facade_base { - llvm::DenseSet Seen; - struct StackFrame { - const ForestNode *Parent; - unsigned ChildIndex; - }; - std::vector Stack; - const ForestNode *Cur; - -public: - RecursiveIterator(const ForestNode *N = nullptr) : Cur(N) {} - - const ForestNode &operator*() const { return *Cur; } - void operator++(); - bool operator==(const RecursiveIterator &I) const { return Cur == I.Cur; } - bool operator!=(const RecursiveIterator &I) const { return !(*this == I); } -}; - -} // namespace pseudo -} // namespace clang - -#endif // CLANG_PSEUDO_FOREST_H diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h deleted file mode 100644 index 0100f818d4ed7..0000000000000 --- a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h +++ /dev/null @@ -1,170 +0,0 @@ -//===--- GLR.h - Implement a GLR parsing algorithm ---------------*- C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This implements a standard Generalized LR (GLR) parsing algorithm. -// -// The GLR parser behaves as a normal LR parser until it encounters a conflict. -// To handle a conflict (where there are multiple actions could perform), the -// parser will simulate nondeterminism by doing a breadth-first search -// over all the possibilities. -// -// Basic mechanisims of the GLR parser: -// - A number of processes are operated in parallel. -// - Each process has its own parsing stack and behaves as a standard -// determinism LR parser. -// - When a process encounters a conflict, it will be fork (one for each -// avaiable action). -// - When a process encounters an error, it is abandoned. -// - All process are synchronized by the lookahead token: they perfrom shift -// action at the same time, which means some processes need wait until other -// processes have performed all reduce actions. -// -//===----------------------------------------------------------------------===// - -#ifndef CLANG_PSEUDO_GLR_H -#define CLANG_PSEUDO_GLR_H - -#include "clang-pseudo/Forest.h" -#include "clang-pseudo/Language.h" -#include "clang-pseudo/grammar/Grammar.h" -#include "clang-pseudo/grammar/LRTable.h" -#include "llvm/Support/Allocator.h" -#include - -namespace clang { -namespace pseudo { - -// A Graph-Structured Stack efficiently represents all parse stacks of a GLR -// parser. -// -// Each node stores a parse state, the last parsed ForestNode, and the parent -// node. There may be several heads (top of stack), and the parser operates by: -// - shift: pushing terminal symbols on top of the stack -// - reduce: replace N symbols on top of the stack with one nonterminal -// -// The structure is a DAG rather than a linear stack: -// - GLR allows multiple actions (conflicts) on the same head, producing forks -// where several nodes have the same parent -// - The parser merges nodes with the same (state, ForestNode), producing joins -// where one node has multiple parents -// -// The parser is responsible for creating nodes and keeping track of the set of -// heads. The GSS class is mostly an arena for them. -struct GSS { - // A node represents a partial parse of the input up to some point. - // - // It is the equivalent of a frame in an LR parse stack. - // Like such a frame, it has an LR parse state and a syntax-tree node - // representing the last parsed symbol (a ForestNode in our case). - // Unlike a regular LR stack frame, it may have multiple parents. - // - // Nodes are not exactly pushed and popped on the stack: pushing is just - // allocating a new head node with a parent pointer to the old head. Popping - // is just forgetting about a node and remembering its parent instead. - struct alignas(struct Node *) Node { - // LR state describing how parsing should continue from this head. - LRTable::StateID State; - // Used internally to track reachability during garbage collection. - bool GCParity; - // Have we already used this node for error recovery? (prevents loops) - mutable bool Recovered = false; - // Number of the parents of this node. - // The parents hold previous parsed symbols, and may resume control after - // this node is reduced. - unsigned ParentCount; - // The parse node for the last parsed symbol. - // This symbol appears on the left of the dot in the parse state's items. - // (In the literature, the node is attached to the *edge* to the parent). - const ForestNode *Payload = nullptr; - - llvm::ArrayRef parents() const { - return llvm::ArrayRef(reinterpret_cast(this + 1), - ParentCount); - }; - // Parents are stored as a trailing array of Node*. - }; - - // Allocates a new node in the graph. - const Node *addNode(LRTable::StateID State, const ForestNode *Symbol, - llvm::ArrayRef Parents); - // Frees all nodes not reachable as ancestors of Roots, and returns the count. - // Calling this periodically prevents steady memory growth of the GSS. - unsigned gc(std::vector &&Roots); - - size_t bytes() const { return Arena.getTotalMemory() + sizeof(*this); } - size_t nodesCreated() const { return NodesCreated; } - -private: - // Nodes are recycled using freelists. - // They are variable size, so use one free-list per distinct #parents. - std::vector> FreeList; - Node *allocate(unsigned Parents); - void destroy(Node *N); - // The list of nodes created and not destroyed - our candidates for gc(). - std::vector Alive; - bool GCParity = false; // All nodes should match this, except during GC. - - llvm::BumpPtrAllocator Arena; - unsigned NodesCreated = 0; -}; -llvm::raw_ostream &operator<<(llvm::raw_ostream &, const GSS::Node &); - -// Parameters for the GLR parsing. -struct ParseParams { - // The token stream to parse. - const TokenStream &Code; - - // Arena for data structure used by the GLR algorithm. - ForestArena &Forest; // Storage for the output forest. - GSS &GSStack; // Storage for parsing stacks. -}; - -// Parses the given token stream as the start symbol with the GLR algorithm, -// and returns a forest node of the start symbol. -// -// A rule `_ := StartSymbol` must exit for the chosen start symbol. -// -// If the parsing fails, we model it as an opaque node in the forest. -ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol, - const Language &Lang); - -// Shift a token onto all OldHeads, placing the results into NewHeads. -// -// Exposed for testing only. -void glrShift(llvm::ArrayRef OldHeads, - const ForestNode &NextTok, const ParseParams &Params, - const Language &Lang, std::vector &NewHeads); -// Applies available reductions on Heads, appending resulting heads to the list. -// -// Exposed for testing only. -void glrReduce(std::vector &Heads, SymbolID Lookahead, - const ParseParams &Params, const Language &Lang); - -// Heuristically recover from a state where no further parsing is possible. -// -// OldHeads is the parse state at TokenIndex. -// This function consumes zero or more tokens by advancing TokenIndex, -// and places any recovery states created in NewHeads. -// -// On failure, NewHeads is empty and TokenIndex is unchanged. -// -// WARNING: glrRecover acts as a "fallback shift". If it consumes no tokens, -// there is a risk of the parser falling into an infinite loop, creating an -// endless sequence of recovery nodes. -// Generally it is safe for recovery to match 0 tokens against sequence symbols -// like `statement-seq`, as the grammar won't permit another statement-seq -// immediately afterwards. However recovery strategies for `statement` should -// consume at least one token, as statements may be adjacent in the input. -void glrRecover(llvm::ArrayRef OldHeads, - unsigned &TokenIndex, const ParseParams &Params, - const Language &Lang, std::vector &NewHeads); - -} // namespace pseudo -} // namespace clang - -#endif // CLANG_PSEUDO_GLR_H diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Language.h b/clang-tools-extra/pseudo/include/clang-pseudo/Language.h deleted file mode 100644 index 1a2b71f081da0..0000000000000 --- a/clang-tools-extra/pseudo/include/clang-pseudo/Language.h +++ /dev/null @@ -1,64 +0,0 @@ -//===--- Language.h -------------------------------------------- -*- C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef CLANG_PSEUDO_LANGUAGE_H -#define CLANG_PSEUDO_LANGUAGE_H - -#include "clang-pseudo/Token.h" -#include "clang-pseudo/grammar/Grammar.h" -#include "clang-pseudo/grammar/LRTable.h" - -namespace clang { -namespace pseudo { -class ForestNode; -class TokenStream; -class LRTable; - -struct GuardParams { - llvm::ArrayRef RHS; - const TokenStream &Tokens; - // FIXME: use the index of Tokens. - SymbolID Lookahead; -}; -// A guard restricts when a grammar rule can be used. -// -// The GLR parser will use the guard to determine whether a rule reduction will -// be conducted. For example, e.g. a guard may allow the rule -// `virt-specifier := IDENTIFIER` only if the identifier's text is 'override`. -// -// Return true if the guard is satisfied. -using RuleGuard = llvm::function_ref; - -// A recovery strategy determines a region of code to skip when parsing fails. -// -// For example, given `class-def := CLASS IDENT { body [recover=Brackets] }`, -// if parsing fails while attempting to parse `body`, we may skip up to the -// matching `}` and assume everything between was a `body`. -// -// The provided index is the token where the skipped region begins. -// Returns the (excluded) end of the range, or Token::Invalid for no recovery. -using RecoveryStrategy = - llvm::function_ref; - -// Specify a language that can be parsed by the pseduoparser. -struct Language { - Grammar G; - LRTable Table; - - // Binding extension ids to corresponding implementations. - llvm::DenseMap Guards; - llvm::DenseMap RecoveryStrategies; - - // FIXME: add clang::LangOptions. - // FIXME: add default start symbols. -}; - -} // namespace pseudo -} // namespace clang - -#endif // CLANG_PSEUDO_LANGUAGE_H diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/cli/CLI.h b/clang-tools-extra/pseudo/include/clang-pseudo/cli/CLI.h deleted file mode 100644 index db09aba21502f..0000000000000 --- a/clang-tools-extra/pseudo/include/clang-pseudo/cli/CLI.h +++ /dev/null @@ -1,35 +0,0 @@ -//===--- CLI.h - Get grammar from variant sources ----------------*- C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Provides the Grammar, LRTable etc for a language specified by the `--grammar` -// flags. It is by design to be used by pseudoparser-based CLI tools. -// -// The CLI library defines a `--grammar` CLI flag, which supports 1) using a -// grammar from a file (--grammar=/path/to/lang.bnf) or using the prebuilt cxx -// language (--grammar=cxx). -// -//===----------------------------------------------------------------------===// - -#ifndef CLANG_PSEUDO_CLI_CLI_H -#define CLANG_PSEUDO_CLI_CLI_H - -#include "clang-pseudo/Language.h" - -namespace clang { -namespace pseudo { - -// Returns the corresponding Language from the '--grammar' command-line flag. -// -// !! If the grammar flag is invalid (e.g. unexisting file), this function will -// exit the program immediately. -const Language &getLanguageFromFlags(); - -} // namespace pseudo -} // namespace clang - -#endif // CLANG_PSEUDO_CLI_CLI_H diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h b/clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h deleted file mode 100644 index 7bbb4d2c00201..0000000000000 --- a/clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h +++ /dev/null @@ -1,91 +0,0 @@ -//===--- CXX.h - Public interfaces for the C++ grammar -----------*- C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines public interfaces for the C++ grammar -// (pseudo/lib/cxx/cxx.bnf). It provides a fast way to access core building -// pieces of the LR parser, e.g. Grammar, LRTable, rather than parsing the -// grammar file at the runtime. -// -// We do a compilation of the C++ BNF grammar at build time, and generate -// critical data sources. The implementation of the interfaces are based on the -// generated data sources. -// -// FIXME: not everything is fully compiled yet. The implementation of the -// interfaces are still parsing the grammar file at the runtime. -// -//===----------------------------------------------------------------------===// - -#ifndef CLANG_PSEUDO_CXX_CXX_H -#define CLANG_PSEUDO_CXX_CXX_H - -#include "clang-pseudo/Language.h" -#include "clang-pseudo/grammar/Grammar.h" - -namespace clang { -namespace pseudo { -namespace cxx { - -// We want enums to be scoped but implicitly convertible to RuleID etc. -// So create regular (unscoped) enums inside subnamespaces of `detail`. -// Then add aliases for them outside `detail`. -namespace detail { -namespace symbols { -enum Symbol : SymbolID { -#define NONTERMINAL(X, Y) X = Y, -#include "CXXSymbols.inc" -#undef NONTERMINAL -}; -} // namespace symbols - -namespace extensions { -enum Extension : ExtensionID { -#define EXTENSION(X, Y) X = Y, -#include "CXXSymbols.inc" -#undef EXTENSION -}; -} // namespace extensions - -namespace rules { -// For each symbol we close the last symbol's enum+namespace and open new ones. -// We need a dummy namespace+enum so that this works for the first rule. -namespace dummy { -enum Dummy { -//clang-format off -#define NONTERMINAL(NAME, ID) \ -}; \ -} \ -namespace NAME { \ -enum Rule : RuleID { -//clang-format on -#define RULE(LHS, RHS, ID) RHS = ID, -#include "CXXSymbols.inc" -}; -} -} // namespace rules -} // namespace detail - -// Symbol represents nonterminal symbols in the C++ grammar. -// It provides a simple uniform way to access a particular nonterminal. -using Symbol = detail::symbols::Symbol; - -using Extension = detail::extensions::Extension; - -namespace rule { -#define NONTERMINAL(NAME, ID) using NAME = detail::rules::NAME::Rule; -#include "CXXSymbols.inc" -} // namespace rule - -// Returns the Language for the cxx.bnf grammar. -const Language &getLanguage(); - -} // namespace cxx - -} // namespace pseudo -} // namespace clang - -#endif // CLANG_PSEUDO_CXX_CXX_H diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h deleted file mode 100644 index a1c779a02d864..0000000000000 --- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h +++ /dev/null @@ -1,230 +0,0 @@ -//===--- Grammar.h - grammar used by clang pseudoparser ---------*- C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines base structures for parsing & modeling a grammar for a -// programming language: -// -// # This is a fake C++ BNF grammar -// _ := translation-unit -// translation-unit := declaration-seq_opt -// declaration-seq := declaration -// declaration-seq := declaration-seq declaration -// -// A grammar formally describes a language, and it is constructed by a set of -// production rules. A rule is of BNF form (AAA := BBB CCC). A symbol is either -// nonterminal or terminal, identified by a SymbolID. -// -// Annotations are supported in a syntax form of [key=value]. They specify -// attributes which are associated with either a grammar symbol (on the -// right-hand side of the symbol) or a grammar rule (at the end of the rule -// body). -// Attributes provide a way to inject custom code into the GLR parser. Each -// unique attribute value creates an extension point (identified by ExtensionID -// ), and an extension point corresponds to a piece of native code. For -// example, C++ grammar has a rule: -// -// compound_statement := { statement-seq [recover=Brackets] } -// -// The `recover` attribute instructs the parser that we should perform error -// recovery if parsing the statement-seq fails. The `Brackets` recovery -// heuristic is implemented in CXX.cpp by binding the ExtensionID for the -// `Recovery` value to a specific C++ function that finds the recovery point. -// -// Notions about the BNF grammar: -// - "_" is the start symbol of the augmented grammar; -// - single-line comment is supported, starting with a # -// - A rule describes how a nonterminal (left side of :=) is constructed, and -// it is *per line* in the grammar file -// - Terminals (also called tokens) correspond to the clang::TokenKind; they -// are written in the grammar like "IDENTIFIER", "USING", "+" -// - Nonterminals are specified with "lower-case" names in the grammar; they -// shouldn't be nullable (has an empty sequence) -// - optional symbols are supported (specified with a _opt suffix), and they -// will be eliminated during the grammar parsing stage -// -//===----------------------------------------------------------------------===// - -#ifndef CLANG_PSEUDO_GRAMMAR_GRAMMAR_H -#define CLANG_PSEUDO_GRAMMAR_GRAMMAR_H - -#include "clang/Basic/TokenKinds.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/raw_ostream.h" -#include -#include -#include - -namespace clang { -namespace pseudo { -// A SymbolID uniquely identifies a terminal/nonterminal symbol in a grammar. -// nonterminal IDs are indexes into a table of nonterminal symbols. -// Terminal IDs correspond to the clang TokenKind enum. -using SymbolID = uint16_t; -// SymbolID is only 12 bits wide. -// There are maximum 2^11 terminals (aka tokens) and 2^11 nonterminals. -static constexpr uint16_t SymbolBits = 12; -static constexpr uint16_t NumTerminals = tok::NUM_TOKENS; -// SymbolIDs with the top bit set are tokens/terminals. -static constexpr SymbolID TokenFlag = 1 << (SymbolBits - 1); -inline bool isToken(SymbolID ID) { return ID & TokenFlag; } -inline bool isNonterminal(SymbolID ID) { return !isToken(ID); } -// The terminals are always the clang tok::TokenKind (not all are used). -inline tok::TokenKind symbolToToken(SymbolID SID) { - assert(isToken(SID)); - SID &= ~TokenFlag; - assert(SID < NumTerminals); - return static_cast(SID); -} -inline constexpr SymbolID tokenSymbol(tok::TokenKind TK) { - return TokenFlag | static_cast(TK); -} - -// An extension is a piece of native code specific to a grammar that modifies -// the behavior of annotated rules. One ExtensionID is assigned for each unique -// attribute value (all attributes share a namespace). -using ExtensionID = uint16_t; - -// A RuleID uniquely identifies a production rule in a grammar. -// It is an index into a table of rules. -using RuleID = uint16_t; -// There are maximum 2^12 rules. -static constexpr unsigned RuleBits = 12; - -// Represent a production rule in the grammar, e.g. -// expression := a b c -// ^Target ^Sequence -struct Rule { - Rule(SymbolID Target, llvm::ArrayRef Seq); - - // We occupy 4 bits for the sequence, in theory, it can be at most 2^4 tokens - // long, however, we're stricter in order to reduce the size, we limit the max - // length to 9 (this is the longest sequence in cxx grammar). - static constexpr unsigned SizeBits = 4; - static constexpr unsigned MaxElements = 9; - static_assert(MaxElements < (1 << SizeBits), "Exceeds the maximum limit"); - static_assert(SizeBits + SymbolBits <= 16, - "Must be able to store symbol ID + size efficiently"); - - // 16 bits for target symbol and size of sequence: - // SymbolID : 12 | Size : 4 - SymbolID Target : SymbolBits; - uint8_t Size : SizeBits; // Size of the Sequence - SymbolID Sequence[MaxElements]; - - // A guarded rule has extra logic to determine whether the RHS is eligible. - bool Guarded = false; - - // Specifies the index within Sequence eligible for error recovery. - // Given stmt := { stmt-seq_opt }, if we fail to parse the stmt-seq then we - // should recover by finding the matching brace, and forcing stmt-seq to match - // everything between braces. - // For now, only a single strategy at a single point is possible. - uint8_t RecoveryIndex = -1; - ExtensionID Recovery = 0; - - llvm::ArrayRef seq() const { - return llvm::ArrayRef(Sequence, Size); - } - friend bool operator==(const Rule &L, const Rule &R) { - return L.Target == R.Target && L.seq() == R.seq() && L.Guarded == R.Guarded; - } -}; - -struct GrammarTable; - -// Grammar that describes a programming language, e.g. C++. It represents the -// contents of the specified grammar. -// It is a building block for constructing a table-based parser. -class Grammar { -public: - Grammar() = default; // Creates an invalid dummy grammar. - explicit Grammar(std::unique_ptr); - - // Parses grammar from a BNF file. - // Diagnostics emitted during parsing are stored in Diags. - static Grammar parseBNF(llvm::StringRef BNF, std::vector &Diags); - - // Returns the SymbolID of the symbol '_'. - SymbolID underscore() const { return Underscore; }; - - // Returns all rules of the given nonterminal symbol. - llvm::ArrayRef rulesFor(SymbolID SID) const; - const Rule &lookupRule(RuleID RID) const; - - // Gets symbol (terminal or nonterminal) name. - // Terminals have names like "," (kw_comma) or "OPERATOR" (kw_operator). - llvm::StringRef symbolName(SymbolID) const; - - // Lookup the SymbolID of the nonterminal symbol by Name. - std::optional findNonterminal(llvm::StringRef Name) const; - - // Dumps the whole grammar. - std::string dump() const; - // Dumps a particular rule. - std::string dumpRule(RuleID) const; - // Dumps all rules of the given nonterminal symbol. - std::string dumpRules(SymbolID) const; - - const GrammarTable &table() const { return *T; } - -private: - std::unique_ptr T; - // The symbol ID of '_'. (In the LR literature, this is the start symbol of - // the augmented grammar.) - SymbolID Underscore; -}; -// For each nonterminal X, computes the set of terminals that begin strings -// derived from X. (Known as FIRST sets in grammar-based parsers). -std::vector> firstSets(const Grammar &); -// For each nonterminal X, computes the set of terminals that could immediately -// follow X. (Known as FOLLOW sets in grammar-based parsers). -std::vector> followSets(const Grammar &); - -// Storage for the underlying data of the Grammar. -// It can be constructed dynamically (from compiling BNF file) or statically -// (a compiled data-source). -struct GrammarTable { - GrammarTable(); - - struct Nonterminal { - std::string Name; - // Corresponding rules that construct the nonterminal, it is a [Start, End) - // index range of the Rules table. - struct { - RuleID Start; - RuleID End; - } RuleRange; - }; - - // RuleID is an index into this table of rule definitions. - // - // Rules with the same target symbol (LHS) are grouped into a single range. - // The relative order of different target symbols is *not* by SymbolID, but - // rather a topological sort: if S := T then the rules producing T have lower - // RuleIDs than rules producing S. - // (This strange order simplifies the GLR parser: for a given token range, if - // we reduce in increasing RuleID order then we need never backtrack -- - // prerequisite reductions are reached before dependent ones). - std::vector Rules; - // A table of terminals (aka tokens). It corresponds to the clang::Token. - // clang::tok::TokenKind is the index of the table. - llvm::ArrayRef Terminals; - // A table of nonterminals, sorted by name. - // SymbolID is the index of the table. - std::vector Nonterminals; - // A table of attribute values, sorted by name. - // ExtensionID is the index of the table. - std::vector AttributeValues; -}; - -} // namespace pseudo -} // namespace clang - -#endif // CLANG_PSEUDO_GRAMMAR_GRAMMAR_H diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h deleted file mode 100644 index dd9e87c2c172b..0000000000000 --- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h +++ /dev/null @@ -1,196 +0,0 @@ -//===--- LRGraph.h - Build an LR automaton ------------------*- C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// LR parsers are bottom-up parsers -- they scan the input from left to right, -// and collect the right-hand side of a production rule (called handle) on top -// of the stack, then replace (reduce) the handle with the nonterminal defined -// by the production rule. -// -// This file defines LRGraph, a deterministic handle-finding finite-state -// automaton, which is a key component in LR parsers to recognize any of -// handles in the grammar efficiently. We build the LR table (ACTION and GOTO -// Table) based on the LRGraph. -// -// LRGraph can be constructed for any context-free grammars. -// Even for a LR-ambiguous grammar, we can construct a deterministic FSA, but -// interpretation of the FSA is nondeterministic -- we might in a state where -// we can continue searching an handle and identify a handle (called -// shift/reduce conflicts), or identify more than one handle (callled -// reduce/reduce conflicts). -// -// LRGraph is a common model for all variants of LR automatons, from the most -// basic one LR(0), the powerful SLR(1), LR(1) which uses a one-token lookahead -// in making decisions. -//===----------------------------------------------------------------------===// - -#ifndef CLANG_PSEUDO_GRAMMAR_LRGRAPH_H -#define CLANG_PSEUDO_GRAMMAR_LRGRAPH_H - -#include "clang-pseudo/grammar/Grammar.h" -#include "llvm/ADT/Hashing.h" -#include - -namespace clang { -namespace pseudo { - -// An LR item -- a grammar rule with a dot at some position of the body. -// e.g. a production rule A := X Y yields 3 items: -// A := . X Y -// A := X . Y -// A := X Y . -// An item indicates how much of a production rule has been recognized at a -// position (described by dot), for example, A := X . Y indicates that we have -// recognized the X part from the input, and we hope next to see the input -// derivable from Y. -class Item { -public: - static Item start(RuleID ID, const Grammar &G) { - Item I; - I.RID = ID; - I.RuleLength = G.lookupRule(ID).Size; - return I; - } - static Item sentinel(RuleID ID) { - Item I; - I.RID = ID; - return I; - } - - RuleID rule() const { return RID; } - uint8_t dot() const { return DotPos; } - - bool hasNext() const { return DotPos < RuleLength; } - SymbolID next(const Grammar &G) const { - assert(hasNext()); - return G.lookupRule(RID).Sequence[DotPos]; - } - - Item advance() const { - assert(hasNext()); - Item I = *this; - ++I.DotPos; - return I; - } - - std::string dump(const Grammar &G) const; - - bool operator==(const Item &I) const { - return DotPos == I.DotPos && RID == I.RID; - } - bool operator<(const Item &I) const { - return std::tie(RID, DotPos) < std::tie(I.RID, I.DotPos); - } - friend llvm::hash_code hash_value(const Item &I) { - return llvm::hash_combine(I.RID, I.DotPos); - } - -private: - RuleID RID = 0; - uint8_t DotPos = 0; - uint8_t RuleLength = 0; // the length of rule body. -}; - -// A state represents a node in the LR automaton graph. It is an item set, which -// contains all possible rules that the LR parser may be parsing in that state. -// -// Conceptually, If we knew in advance what we're parsing, at any point we're -// partway through parsing a production, sitting on a stack of partially parsed -// productions. But because we don't know, there could be *several* productions -// we're partway through. The set of possibilities is the parser state, and we -// precompute all the transitions between these states. -struct State { - // A full set of items (including non-kernel items) representing the state, - // in a canonical order (see SortByNextSymbol in the cpp file). - std::vector Items; - - std::string dump(const Grammar &G, unsigned Indent = 0) const; -}; - -// LRGraph is a deterministic finite state automaton for LR parsing. -// -// Intuitively, an LR automaton is a transition graph. The graph has a -// collection of nodes, called States. Each state corresponds to a particular -// item set, which represents a condition that could occur during the process of -// parsing a production. Edges are directed from one state to another. Each edge -// is labeled by a grammar symbol (terminal or nonterminal). -// -// LRGraph is used to construct the LR parsing table which is a core -// data-structure driving the LR parser. -class LRGraph { -public: - // StateID is the index in States table. - using StateID = uint16_t; - - // Constructs an LR(0) automaton. - static LRGraph buildLR0(const Grammar &); - - // An edge in the LR graph, it represents a transition in the LR automaton. - // If the parser is at state Src, with a lookahead Label, then it - // transits to state Dst. - struct Edge { - StateID Src, Dst; - SymbolID Label; - }; - - // A possible error recovery: choose to match some tokens against a symbol. - // - // e.g. a state that contains - // stmt := { . stmt-seq [recover=braces] } - // has a Recovery { Src = S, Strategy=braces, Result=stmt-seq }. - struct Recovery { - StateID Src; // The state we are in when encountering the error. - ExtensionID Strategy; // Heuristic choosing the tokens to match. - SymbolID Result; // The symbol that is produced. - }; - - llvm::ArrayRef states() const { return States; } - llvm::ArrayRef edges() const { return Edges; } - llvm::ArrayRef recoveries() const { return Recoveries; } - llvm::ArrayRef> startStates() const { - return StartStates; - } - - std::string dumpForTests(const Grammar &) const; - -private: - LRGraph(std::vector States, std::vector Edges, - std::vector Recoveries, - std::vector> StartStates) - : States(std::move(States)), Edges(std::move(Edges)), - Recoveries(std::move(Recoveries)), StartStates(std::move(StartStates)) { - } - - std::vector States; - std::vector Edges; - std::vector Recoveries; - std::vector> StartStates; -}; - -} // namespace pseudo -} // namespace clang - -namespace llvm { -// Support clang::pseudo::Item as DenseMap keys. -template <> struct DenseMapInfo { - static inline clang::pseudo::Item getEmptyKey() { - return clang::pseudo::Item::sentinel(-1); - } - static inline clang::pseudo::Item getTombstoneKey() { - return clang::pseudo::Item::sentinel(-2); - } - static unsigned getHashValue(const clang::pseudo::Item &I) { - return hash_value(I); - } - static bool isEqual(const clang::pseudo::Item &LHS, - const clang::pseudo::Item &RHS) { - return LHS == RHS; - } -}; -} // namespace llvm - -#endif // CLANG_PSEUDO_GRAMMAR_LRGRAPH_H diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h deleted file mode 100644 index 1706b6936c9ea..0000000000000 --- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h +++ /dev/null @@ -1,278 +0,0 @@ -//===--- LRTable.h - Define LR Parsing Table ---------------------*- C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// The LRTable (referred as LR parsing table in the LR literature) is the core -// component in LR parsers, it drives the LR parsers by specifying an action to -// take given the current state on the top of the stack and the current -// lookahead token. -// -// The LRTable can be described as a matrix where the rows represent -// the states of the LR graph, the columns represent the symbols of the -// grammar, and each entry of the matrix (called action) represents a -// state transition in the graph. -// -// Typically, based on the category of the grammar symbol, the LRTable is -// broken into two logically separate tables: -// - ACTION table with terminals as columns -- e.g. ACTION[S, a] specifies -// next action (shift/reduce) on state S under a lookahead terminal a -// - GOTO table with nonterminals as columns -- e.g. GOTO[S, X] specifies -// the state which we transist to from the state S with the nonterminal X -// -// LRTable is *performance-critial* as it is consulted frequently during a -// parse. In general, LRTable is very sparse (most of the entries are empty). -// For example, for the C++ language, the SLR table has ~1500 states and 650 -// symbols which results in a matrix having 975K entries, ~90% of entries are -// empty. -// -// This file implements a speed-and-space-efficient LRTable. -// -//===----------------------------------------------------------------------===// - -#ifndef CLANG_PSEUDO_GRAMMAR_LRTABLE_H -#define CLANG_PSEUDO_GRAMMAR_LRTABLE_H - -#include "clang-pseudo/grammar/Grammar.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/Support/Capacity.h" -#include "llvm/Support/MathExtras.h" -#include -#include - -namespace clang { -namespace pseudo { - -// Represents the LR parsing table, which can efficiently the question "what is -// the next step given the lookahead token and current state on top of the -// stack?". -// -// This is a dense implementation, which only takes an amount of space that is -// proportional to the number of non-empty entries in the table. -// -// Unlike the typical LR parsing table which allows at most one available action -// per entry, conflicted actions are allowed in LRTable. The LRTable is designed -// to be used in nondeterministic LR parsers (e.g. GLR). -// -// There are no "accept" actions in the LRTable, instead the stack is inspected -// after parsing completes: is the state goto(StartState, StartSymbol)? -class LRTable { -public: - // StateID is only 13 bits wide. - using StateID = uint16_t; - static constexpr unsigned StateBits = 13; - - struct Recovery { - ExtensionID Strategy; - SymbolID Result; - }; - - // Returns the state after we reduce a nonterminal. - // Expected to be called by LR parsers. - // If the nonterminal is invalid here, returns std::nullopt. - std::optional getGoToState(StateID State, - SymbolID Nonterminal) const { - return Gotos.get(gotoIndex(State, Nonterminal, numStates())); - } - // Returns the state after we shift a terminal. - // Expected to be called by LR parsers. - // If the terminal is invalid here, returns std::nullopt. - std::optional getShiftState(StateID State, - SymbolID Terminal) const { - return Shifts.get(shiftIndex(State, Terminal, numStates())); - } - - // Returns the possible reductions from a state. - // - // These are not keyed by a lookahead token. Instead, call canFollow() to - // check whether a reduction should apply in the current context: - // for (RuleID R : LR.getReduceRules(S)) { - // if (!LR.canFollow(G.lookupRule(R).Target, NextToken)) - // continue; - // // ...apply reduce... - // } - llvm::ArrayRef getReduceRules(StateID State) const { - assert(State + 1u < ReduceOffset.size()); - return llvm::ArrayRef(Reduces.data() + ReduceOffset[State], - Reduces.data() + ReduceOffset[State + 1]); - } - // Returns whether Terminal can follow Nonterminal in a valid source file. - bool canFollow(SymbolID Nonterminal, SymbolID Terminal) const { - assert(isToken(Terminal)); - assert(isNonterminal(Nonterminal)); - // tok::unknown is a sentinel value used in recovery: can follow anything. - return Terminal == tokenSymbol(tok::unknown) || - FollowSets.test(tok::NUM_TOKENS * Nonterminal + - symbolToToken(Terminal)); - } - - // Looks up available recovery actions if we stopped parsing in this state. - llvm::ArrayRef getRecovery(StateID State) const { - return llvm::ArrayRef(Recoveries.data() + RecoveryOffset[State], - Recoveries.data() + RecoveryOffset[State + 1]); - } - - // Returns the state from which the LR parser should start to parse the input - // tokens as the given StartSymbol. - // - // In LR parsing, the start state of `translation-unit` corresponds to - // `_ := • translation-unit`. - // - // Each start state responds to **a** single grammar rule like `_ := start`. - // REQUIRE: The given StartSymbol must exist in the grammar (in a form of - // `_ := start`). - StateID getStartState(SymbolID StartSymbol) const; - - size_t bytes() const { - return sizeof(*this) + Gotos.bytes() + Shifts.bytes() + - llvm::capacity_in_bytes(Reduces) + - llvm::capacity_in_bytes(ReduceOffset) + - llvm::capacity_in_bytes(FollowSets); - } - - std::string dumpStatistics() const; - std::string dumpForTests(const Grammar &G) const; - - // Build a SLR(1) parsing table. - static LRTable buildSLR(const Grammar &G); - - // Helper for building a table with specified actions/states. - struct Builder { - Builder() = default; - Builder(const Grammar &G) { - NumNonterminals = G.table().Nonterminals.size(); - FollowSets = followSets(G); - } - - unsigned int NumNonterminals = 0; - // States representing `_ := . start` for various start symbols. - std::vector> StartStates; - // State transitions `X := ABC . D EFG` => `X := ABC D . EFG`. - // Key is (initial state, D), value is final state. - llvm::DenseMap, StateID> Transition; - // Reductions available in a given state. - llvm::DenseMap> Reduce; - // FollowSets[NT] is the set of terminals that can follow the nonterminal. - std::vector> FollowSets; - // Recovery options available at each state. - std::vector> Recoveries; - - LRTable build() &&; - }; - -private: - unsigned numStates() const { return ReduceOffset.size() - 1; } - - // A map from unsigned key => StateID, used to store actions. - // The keys should be sequential but the values are somewhat sparse. - // - // In practice, the keys encode (origin state, symbol) pairs, and the values - // are the state we should move to after seeing that symbol. - // - // We store one bit for presence/absence of the value for each key. - // At every 64th key, we store the offset into the table of values. - // e.g. key 0x500 is checkpoint 0x500/64 = 20 - // Checkpoints[20] = 34 - // get(0x500) = Values[34] (assuming it has a value) - // To look up values in between, we count the set bits: - // get(0x509) has a value if HasValue[20] & (1<<9) - // #values between 0x500 and 0x509: popcnt(HasValue[20] & (1<<9 - 1)) - // get(0x509) = Values[34 + popcnt(...)] - // - // Overall size is 1.25 bits/key + 16 bits/value. - // Lookup is constant time with a low factor (no hashing). - class TransitionTable { - using Word = uint64_t; - constexpr static unsigned WordBits = CHAR_BIT * sizeof(Word); - - std::vector Values; - std::vector HasValue; - std::vector Checkpoints; - - public: - TransitionTable() = default; - TransitionTable(const llvm::DenseMap &Entries, - unsigned NumKeys) { - assert( - Entries.size() < - std::numeric_limits::max() && - "16 bits too small for value offsets!"); - unsigned NumWords = (NumKeys + WordBits - 1) / WordBits; - HasValue.resize(NumWords, 0); - Checkpoints.reserve(NumWords); - Values.reserve(Entries.size()); - for (unsigned I = 0; I < NumKeys; ++I) { - if ((I % WordBits) == 0) - Checkpoints.push_back(Values.size()); - auto It = Entries.find(I); - if (It != Entries.end()) { - HasValue[I / WordBits] |= (Word(1) << (I % WordBits)); - Values.push_back(It->second); - } - } - } - - std::optional get(unsigned Key) const { - // Do we have a value for this key? - Word KeyMask = Word(1) << (Key % WordBits); - unsigned KeyWord = Key / WordBits; - if ((HasValue[KeyWord] & KeyMask) == 0) - return std::nullopt; - // Count the number of values since the checkpoint. - Word BelowKeyMask = KeyMask - 1; - unsigned CountSinceCheckpoint = - llvm::popcount(HasValue[KeyWord] & BelowKeyMask); - // Find the value relative to the last checkpoint. - return Values[Checkpoints[KeyWord] + CountSinceCheckpoint]; - } - - unsigned size() const { return Values.size(); } - - size_t bytes() const { - return llvm::capacity_in_bytes(HasValue) + - llvm::capacity_in_bytes(Values) + - llvm::capacity_in_bytes(Checkpoints); - } - }; - // Shift and Goto tables are keyed by encoded (State, Symbol). - static unsigned shiftIndex(StateID State, SymbolID Terminal, - unsigned NumStates) { - return NumStates * symbolToToken(Terminal) + State; - } - static unsigned gotoIndex(StateID State, SymbolID Nonterminal, - unsigned NumStates) { - assert(isNonterminal(Nonterminal)); - return NumStates * Nonterminal + State; - } - TransitionTable Shifts; - TransitionTable Gotos; - - // A sorted table, storing the start state for each target parsing symbol. - std::vector> StartStates; - - // Given a state ID S, the half-open range of Reduces is - // [ReduceOffset[S], ReduceOffset[S+1]) - std::vector ReduceOffset; - std::vector Reduces; - // Conceptually this is a bool[SymbolID][Token], each entry describing whether - // the grammar allows the (nonterminal) symbol to be followed by the token. - // - // This is flattened by encoding the (SymbolID Nonterminal, tok::Kind Token) - // as an index: Nonterminal * NUM_TOKENS + Token. - llvm::BitVector FollowSets; - - // Recovery stores all recovery actions from all states. - // A given state has [RecoveryOffset[S], RecoveryOffset[S+1]). - std::vector RecoveryOffset; - std::vector Recoveries; -}; - -} // namespace pseudo -} // namespace clang - -#endif // CLANG_PSEUDO_GRAMMAR_LRTABLE_H diff --git a/clang-tools-extra/pseudo/lib/CMakeLists.txt b/clang-tools-extra/pseudo/lib/CMakeLists.txt index f92f79be12150..0f56728d0eceb 100644 --- a/clang-tools-extra/pseudo/lib/CMakeLists.txt +++ b/clang-tools-extra/pseudo/lib/CMakeLists.txt @@ -1,22 +1,14 @@ -add_subdirectory(cli) -add_subdirectory(cxx) -add_subdirectory(grammar) - set(LLVM_LINK_COMPONENTS Support) add_clang_library(clangPseudo Bracket.cpp DirectiveTree.cpp - Disambiguate.cpp - Forest.cpp - GLR.cpp Lex.cpp Token.cpp LINK_LIBS clangBasic clangLex - clangPseudoGrammar DEPENDS ClangDriverOptions diff --git a/clang-tools-extra/pseudo/lib/Disambiguate.cpp b/clang-tools-extra/pseudo/lib/Disambiguate.cpp deleted file mode 100644 index b0bc75cf96c93..0000000000000 --- a/clang-tools-extra/pseudo/lib/Disambiguate.cpp +++ /dev/null @@ -1,48 +0,0 @@ -//===--- Disambiguate.cpp - Find the best tree in the forest --------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "clang-pseudo/Disambiguate.h" - -namespace clang::pseudo { - -Disambiguation disambiguate(const ForestNode *Root, - const DisambiguateParams &Params) { - // FIXME: this is a dummy placeholder strategy, implement a real one! - Disambiguation Result; - for (const ForestNode &N : Root->descendants()) { - if (N.kind() == ForestNode::Ambiguous) - Result.try_emplace(&N, 1); - } - return Result; -} - -void removeAmbiguities(ForestNode *&Root, const Disambiguation &D) { - std::vector Queue = {&Root}; - while (!Queue.empty()) { - ForestNode **Next = Queue.back(); - Queue.pop_back(); - switch ((*Next)->kind()) { - case ForestNode::Sequence: - for (ForestNode *&Child : (*Next)->elements()) - Queue.push_back(&Child); - break; - case ForestNode::Ambiguous: { - assert(D.count(*Next) != 0 && "disambiguation is incomplete!"); - ForestNode *ChosenChild = (*Next)->alternatives()[D.lookup(*Next)]; - *Next = ChosenChild; - Queue.push_back(Next); - break; - } - case ForestNode::Terminal: - case ForestNode::Opaque: - break; - } - } -} - -} // namespace clang::pseudo diff --git a/clang-tools-extra/pseudo/lib/Forest.cpp b/clang-tools-extra/pseudo/lib/Forest.cpp deleted file mode 100644 index e8e60e5ec475a..0000000000000 --- a/clang-tools-extra/pseudo/lib/Forest.cpp +++ /dev/null @@ -1,199 +0,0 @@ -//===--- Forest.cpp - Parse forest ------------------------------*- C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "clang-pseudo/Forest.h" -#include "clang-pseudo/Token.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/Support/FormatVariadic.h" -#include - -namespace clang { -namespace pseudo { - -void ForestNode::RecursiveIterator::operator++() { - auto C = Cur->children(); - // Try to find a child of the current node to descend into. - for (unsigned I = 0; I < C.size(); ++I) { - if (Seen.insert(C[I]).second) { - Stack.push_back({Cur, I}); - Cur = C[I]; - return; - } - } - // Try to find a sibling af an ancestor to advance to. - for (; !Stack.empty(); Stack.pop_back()) { - C = Stack.back().Parent->children(); - unsigned &Index = Stack.back().ChildIndex; - while (++Index < C.size()) { - if (Seen.insert(C[Index]).second) { - Cur = C[Index]; - return; - } - } - } - Cur = nullptr; -} - -llvm::iterator_range -ForestNode::descendants() const { - return {RecursiveIterator(this), RecursiveIterator()}; -} - -std::string ForestNode::dump(const Grammar &G) const { - switch (kind()) { - case Ambiguous: - return llvm::formatv("{0} := ", G.symbolName(symbol())); - case Terminal: - return llvm::formatv("{0} := tok[{1}]", G.symbolName(symbol()), - startTokenIndex()); - case Sequence: - return G.dumpRule(rule()); - case Opaque: - return llvm::formatv("{0} := ", G.symbolName(symbol())); - } - llvm_unreachable("Unhandled node kind!"); -} - -std::string ForestNode::dumpRecursive(const Grammar &G, - bool Abbreviated) const { - using llvm::formatv; - Token::Index MaxToken = 0; - // Count visits of nodes so we can mark those seen multiple times. - llvm::DenseMap VisitCounts; - std::function CountVisits = - [&](const ForestNode *P) { - MaxToken = std::max(MaxToken, P->startTokenIndex()); - if (VisitCounts[P]++ > 0) - return; // Don't count children as multiply visited. - if (P->kind() == Ambiguous) - llvm::for_each(P->alternatives(), CountVisits); - else if (P->kind() == Sequence) - llvm::for_each(P->elements(), CountVisits); - }; - CountVisits(this); - - unsigned IndexWidth = std::max(3, (int)std::to_string(MaxToken).size()); - // e.g. "[{0,4}, {1,4})" if MaxToken is 5742. - std::string RangeFormat = formatv("[{{0,{0}}, {{1,{0}}) ", IndexWidth); - - // The box-drawing characters that should be added as a child is rendered. - struct LineDecoration { - std::string Prefix; // Prepended to every line. - llvm::StringRef First; // added to the child's line. - llvm::StringRef Subsequent; // added to descendants' lines. - }; - - // We print a "#" for nonterminal forest nodes that are being dumped - // multiple times. - llvm::DenseMap ReferenceIds; - std::string Result; - constexpr Token::Index KEnd = std::numeric_limits::max(); - std::function, - LineDecoration &LineDec)> - Dump = [&](const ForestNode *P, Token::Index End, - std::optional ElidedParent, LineDecoration LineDec) { - bool SharedNode = VisitCounts.find(P)->getSecond() > 1; - llvm::ArrayRef Children; - auto EndOfElement = [&](size_t ChildIndex) { - return ChildIndex + 1 == Children.size() - ? End - : Children[ChildIndex + 1]->startTokenIndex(); - }; - if (P->kind() == Ambiguous) { - Children = P->alternatives(); - } else if (P->kind() == Sequence) { - Children = P->elements(); - if (Abbreviated) { - // Abbreviate chains of trivial sequence nodes. - // A := B, B := C, C := D, D := X Y Z - // becomes - // A~D := X Y Z - // - // We can't hide nodes that appear multiple times in the tree, - // because we need to call out their identity with IDs. - if (Children.size() == 1 && !SharedNode) { - assert(Children[0]->startTokenIndex() == P->startTokenIndex() && - EndOfElement(0) == End); - return Dump(Children[0], End, - /*ElidedParent=*/ElidedParent.value_or(P->symbol()), - LineDec); - } - } - } - - if (End == KEnd) - Result += formatv(RangeFormat.c_str(), P->startTokenIndex(), "end"); - else - Result += formatv(RangeFormat.c_str(), P->startTokenIndex(), End); - Result += LineDec.Prefix; - Result += LineDec.First; - if (ElidedParent) { - Result += G.symbolName(*ElidedParent); - Result += "~"; - } - - if (SharedNode && P->kind() != ForestNode::Terminal) { - auto It = ReferenceIds.try_emplace(P, ReferenceIds.size() + 1); - bool First = It.second; - unsigned ID = It.first->second; - - // The first time, print as #1. Later, =#1. - if (First) { - Result += formatv("{0} #{1}", P->dump(G), ID); - } else { - Result += formatv("{0} =#{1}", G.symbolName(P->symbol()), ID); - Children = {}; // Don't walk the children again. - } - } else { - Result.append(P->dump(G)); - } - Result.push_back('\n'); - - auto OldPrefixSize = LineDec.Prefix.size(); - LineDec.Prefix += LineDec.Subsequent; - for (size_t I = 0; I < Children.size(); ++I) { - if (I == Children.size() - 1) { - LineDec.First = "└─"; - LineDec.Subsequent = " "; - } else { - LineDec.First = "├─"; - LineDec.Subsequent = "│ "; - } - Dump(Children[I], P->kind() == Sequence ? EndOfElement(I) : End, - std::nullopt, LineDec); - } - LineDec.Prefix.resize(OldPrefixSize); - }; - LineDecoration LineDec; - Dump(this, KEnd, std::nullopt, LineDec); - return Result; -} - -llvm::ArrayRef -ForestArena::createTerminals(const TokenStream &Code) { - ForestNode *Terminals = Arena.Allocate(Code.tokens().size() + 1); - size_t Index = 0; - for (const auto &T : Code.tokens()) { - new (&Terminals[Index]) - ForestNode(ForestNode::Terminal, tokenSymbol(T.Kind), - /*Start=*/Index, /*TerminalData*/ 0); - ++Index; - } - // Include an `eof` terminal. - // This is important to drive the final shift/recover/reduce loop. - new (&Terminals[Index]) - ForestNode(ForestNode::Terminal, tokenSymbol(tok::eof), - /*Start=*/Index, /*TerminalData*/ 0); - ++Index; - NodeCount = Index; - return llvm::ArrayRef(Terminals, Index); -} - -} // namespace pseudo -} // namespace clang diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp deleted file mode 100644 index ac43c02db521e..0000000000000 --- a/clang-tools-extra/pseudo/lib/GLR.cpp +++ /dev/null @@ -1,772 +0,0 @@ -//===--- GLR.cpp -----------------------------------------------*- C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "clang-pseudo/GLR.h" -#include "clang-pseudo/Language.h" -#include "clang-pseudo/grammar/Grammar.h" -#include "clang-pseudo/grammar/LRTable.h" -#include "clang/Basic/TokenKinds.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/ScopeExit.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/FormatVariadic.h" -#include -#include -#include -#include - -#define DEBUG_TYPE "GLR.cpp" - -namespace clang { -namespace pseudo { -namespace { - -Token::Index findRecoveryEndpoint(ExtensionID Strategy, Token::Index Begin, - const TokenStream &Tokens, - const Language &Lang) { - assert(Strategy != 0); - if (auto S = Lang.RecoveryStrategies.lookup(Strategy)) - return S(Begin, Tokens); - return Token::Invalid; -} - -} // namespace - -void glrRecover(llvm::ArrayRef OldHeads, - unsigned &TokenIndex, const ParseParams &Params, - const Language &Lang, - std::vector &NewHeads) { - LLVM_DEBUG(llvm::dbgs() << "Recovery at token " << TokenIndex << "...\n"); - // Describes a possibility to recover by forcibly interpreting a range of - // tokens around the cursor as a nonterminal that we expected to see. - struct PlaceholderRecovery { - // The token prior to the nonterminal which is being recovered. - // This starts of the region we're skipping, so higher Position is better. - Token::Index Position; - // The nonterminal which will be created in order to recover. - SymbolID Symbol; - // The heuristic used to choose the bounds of the nonterminal to recover. - ExtensionID Strategy; - - // The GSS head where we are expecting the recovered nonterminal. - const GSS::Node *RecoveryNode; - // Payload of nodes on the way back from the OldHead to the recovery node. - // These represent the partial parse that is being discarded. - // They should become the children of the opaque recovery node. - // FIXME: internal structure of opaque nodes is not implemented. - // - // There may be multiple paths leading to the same recovery node, we choose - // one arbitrarily. - std::vector DiscardedParse; - }; - std::vector Options; - - // Find recovery options by walking up the stack. - // - // This is similar to exception handling: we walk up the "frames" of nested - // rules being parsed until we find one that has a "handler" which allows us - // to determine the node bounds without parsing it. - // - // Unfortunately there's a significant difference: the stack contains both - // "upward" nodes (ancestor parses) and "leftward" ones. - // e.g. when parsing `{ if (1) ? }` as compound-stmt, the stack contains: - // stmt := IF ( expr ) . stmt - current state, we should recover here! - // stmt := IF ( expr . ) stmt - (left, no recovery here) - // stmt := IF ( . expr ) stmt - left, we should NOT recover here! - // stmt := IF . ( expr ) stmt - (left, no recovery here) - // stmt-seq := . stmt - up, we might recover here - // compound-stmt := { . stmt-seq } - up, we should recover here! - // - // It's not obvious how to avoid collecting "leftward" recovery options. - // I think the distinction is ill-defined after merging items into states. - // For now, we have to take this into account when defining recovery rules. - // (e.g. in the expr recovery above, stay inside the parentheses). - // FIXME: find a more satisfying way to avoid such false recovery. - // FIXME: Add a test for spurious recovery once tests can define strategies. - std::vector Path; - llvm::DenseSet Seen; - auto WalkUp = [&](const GSS::Node *N, Token::Index NextTok, auto &WalkUp) { - if (!Seen.insert(N).second) - return; - if (!N->Recovered) { // Don't recover the same way twice! - for (auto Strategy : Lang.Table.getRecovery(N->State)) { - Options.push_back(PlaceholderRecovery{ - NextTok, - Strategy.Result, - Strategy.Strategy, - N, - Path, - }); - LLVM_DEBUG(llvm::dbgs() - << "Option: recover " << Lang.G.symbolName(Strategy.Result) - << " at token " << NextTok << "\n"); - } - } - Path.push_back(N->Payload); - for (const GSS::Node *Parent : N->parents()) - WalkUp(Parent, N->Payload->startTokenIndex(), WalkUp); - Path.pop_back(); - }; - for (auto *N : OldHeads) - WalkUp(N, TokenIndex, WalkUp); - - // Now we select the option(s) we will use to recover. - // - // We prefer options starting further right, as these discard less code - // (e.g. we prefer to recover inner scopes rather than outer ones). - // The options also need to agree on an endpoint, so the parser has a - // consistent position afterwards. - // - // So conceptually we're sorting by the tuple (start, end), though we avoid - // computing `end` for options that can't be winners. - - // Consider options starting further right first. - // Don't drop the others yet though, we may still use them if preferred fails. - llvm::stable_sort(Options, [&](const auto &L, const auto &R) { - return L.Position > R.Position; - }); - - // We may find multiple winners, but they will have the same range. - std::optional RecoveryRange; - std::vector BestOptions; - for (const PlaceholderRecovery &Option : Options) { - // If this starts further left than options we've already found, then - // we'll never find anything better. Skip computing End for the rest. - if (RecoveryRange && Option.Position < RecoveryRange->Begin) - break; - - auto End = findRecoveryEndpoint(Option.Strategy, Option.Position, - Params.Code, Lang); - // Recovery may not take the parse backwards. - if (End == Token::Invalid || End < TokenIndex) - continue; - if (RecoveryRange) { - // If this is worse than our previous options, ignore it. - if (RecoveryRange->End < End) - continue; - // If this is an improvement over our previous options, then drop them. - if (RecoveryRange->End > End) - BestOptions.clear(); - } - // Create recovery nodes and heads for them in the GSS. These may be - // discarded if a better recovery is later found, but this path isn't hot. - RecoveryRange = {Option.Position, End}; - BestOptions.push_back(&Option); - } - - if (BestOptions.empty()) { - LLVM_DEBUG(llvm::dbgs() << "Recovery failed after trying " << Options.size() - << " strategies\n"); - return; - } - - // We've settled on a set of recovery options, so create their nodes and - // advance the cursor. - LLVM_DEBUG({ - llvm::dbgs() << "Recovered range=" << *RecoveryRange << ":"; - for (const auto *Option : BestOptions) - llvm::dbgs() << " " << Lang.G.symbolName(Option->Symbol); - llvm::dbgs() << "\n"; - }); - // FIXME: in general, we might have the same Option->Symbol multiple times, - // and we risk creating redundant Forest and GSS nodes. - // We also may inadvertently set up the next glrReduce to create a sequence - // node duplicating an opaque node that we're creating here. - // There are various options, including simply breaking ties between options. - // For now it's obscure enough to ignore. - for (const PlaceholderRecovery *Option : BestOptions) { - Option->RecoveryNode->Recovered = true; - const ForestNode &Placeholder = - Params.Forest.createOpaque(Option->Symbol, RecoveryRange->Begin); - LRTable::StateID OldState = Option->RecoveryNode->State; - LRTable::StateID NewState = - isToken(Option->Symbol) - ? *Lang.Table.getShiftState(OldState, Option->Symbol) - : *Lang.Table.getGoToState(OldState, Option->Symbol); - const GSS::Node *NewHead = - Params.GSStack.addNode(NewState, &Placeholder, {Option->RecoveryNode}); - NewHeads.push_back(NewHead); - } - TokenIndex = RecoveryRange->End; -} - -using StateID = LRTable::StateID; - -llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const GSS::Node &N) { - std::vector ParentStates; - for (const auto *Parent : N.parents()) - ParentStates.push_back(llvm::formatv("{0}", Parent->State)); - OS << llvm::formatv("state {0}, parsed symbol {1}, parents {3}", N.State, - N.Payload ? N.Payload->symbol() : 0, - llvm::join(ParentStates, ", ")); - return OS; -} - -// Apply all pending shift actions. -// In theory, LR parsing doesn't have shift/shift conflicts on a single head. -// But we may have multiple active heads, and each head has a shift action. -// -// We merge the stack -- if multiple heads will reach the same state after -// shifting a token, we shift only once by combining these heads. -// -// E.g. we have two heads (2, 3) in the GSS, and will shift both to reach 4: -// 0---1---2 -// └---3 -// After the shift action, the GSS is: -// 0---1---2---4 -// └---3---┘ -void glrShift(llvm::ArrayRef OldHeads, - const ForestNode &NewTok, const ParseParams &Params, - const Language &Lang, std::vector &NewHeads) { - assert(NewTok.kind() == ForestNode::Terminal); - LLVM_DEBUG(llvm::dbgs() << llvm::formatv(" Shift {0} ({1} active heads):\n", - Lang.G.symbolName(NewTok.symbol()), - OldHeads.size())); - - // We group pending shifts by their target state so we can merge them. - llvm::SmallVector, 8> Shifts; - for (const auto *H : OldHeads) - if (auto S = Lang.Table.getShiftState(H->State, NewTok.symbol())) - Shifts.push_back({*S, H}); - llvm::stable_sort(Shifts, llvm::less_first{}); - - auto Rest = llvm::ArrayRef(Shifts); - llvm::SmallVector Parents; - while (!Rest.empty()) { - // Collect the batch of PendingShift that have compatible shift states. - // Their heads become TempParents, the parents of the new GSS node. - StateID NextState = Rest.front().first; - - Parents.clear(); - for (const auto &Base : Rest) { - if (Base.first != NextState) - break; - Parents.push_back(Base.second); - } - Rest = Rest.drop_front(Parents.size()); - - LLVM_DEBUG(llvm::dbgs() << llvm::formatv(" --> S{0} ({1} heads)\n", - NextState, Parents.size())); - NewHeads.push_back(Params.GSStack.addNode(NextState, &NewTok, Parents)); - } -} - -namespace { -// A KeyedQueue yields pairs of keys and values in order of the keys. -template -using KeyedQueue = - std::priority_queue, - std::vector>, llvm::less_first>; - -template void sortAndUnique(std::vector &Vec) { - llvm::sort(Vec); - Vec.erase(std::unique(Vec.begin(), Vec.end()), Vec.end()); -} - -// Perform reduces until no more are possible. -// -// Generally this means walking up from the heads gathering ForestNodes that -// will match the RHS of the rule we're reducing into a sequence ForestNode, -// and ending up at a base node. -// Then we push a new GSS node onto that base, taking care to: -// - pack alternative sequence ForestNodes into an ambiguous ForestNode. -// - use the same GSS node for multiple heads if the parse state matches. -// -// Examples of reduction: -// Before (simple): -// 0--1(expr)--2(semi) -// After reducing 2 by `stmt := expr semi`: -// 0--3(stmt) // 3 is goto(0, stmt) -// -// Before (splitting due to R/R conflict): -// 0--1(IDENTIFIER) -// After reducing 1 by `class-name := IDENTIFIER` & `enum-name := IDENTIFIER`: -// 0--2(class-name) // 2 is goto(0, class-name) -// └--3(enum-name) // 3 is goto(0, enum-name) -// -// Before (splitting due to multiple bases): -// 0--2(class-name)--4(STAR) -// └--3(enum-name)---┘ -// After reducing 4 by `ptr-operator := STAR`: -// 0--2(class-name)--5(ptr-operator) // 5 is goto(2, ptr-operator) -// └--3(enum-name)---6(ptr-operator) // 6 is goto(3, ptr-operator) -// -// Before (joining due to same goto state, multiple bases): -// 0--1(cv-qualifier)--3(class-name) -// └--2(cv-qualifier)--4(enum-name) -// After reducing 3 by `type-name := class-name` and -// 4 by `type-name := enum-name`: -// 0--1(cv-qualifier)--5(type-name) // 5 is goto(1, type-name) and -// └--2(cv-qualifier)--┘ // goto(2, type-name) -// -// Before (joining due to same goto state, the same base): -// 0--1(class-name)--3(STAR) -// └--2(enum-name)--4(STAR) -// After reducing 3 by `pointer := class-name STAR` and -// 2 by`enum-name := class-name STAR`: -// 0--5(pointer) // 5 is goto(0, pointer) -// -// (This is a functor rather than a function to allow it to reuse scratch -// storage across calls). -class GLRReduce { - const ParseParams &Params; - const Language& Lang; - // There are two interacting complications: - // 1. Performing one reduce can unlock new reduces on the newly-created head. - // 2a. The ambiguous ForestNodes must be complete (have all sequence nodes). - // This means we must have unlocked all the reduces that contribute to it. - // 2b. Similarly, the new GSS nodes must be complete (have all parents). - // - // We define a "family" of reduces as those that produce the same symbol and - // cover the same range of tokens. These are exactly the set of reductions - // whose sequence nodes would be covered by the same ambiguous node. - // We wish to process a whole family at a time (to satisfy complication 2), - // and can address complication 1 by carefully ordering the families: - // - Process families covering fewer tokens first. - // A reduce can't depend on a longer reduce! - // - For equal token ranges: if S := T, process T families before S families. - // Parsing T can't depend on an equal-length S, as the grammar is acyclic. - // - // This isn't quite enough: we don't know the token length of the reduction - // until we walk up the stack to perform the pop. - // So we perform the pop part upfront, and place the push specification on - // priority queues such that we can retrieve a family at a time. - - // A reduction family is characterized by its token range and symbol produced. - // It is used as a key in the priority queues to group pushes by family. - struct Family { - // The start of the token range of the reduce. - Token::Index Start; - SymbolID Symbol; - // Rule must produce Symbol and can otherwise be arbitrary. - // RuleIDs have the topological order based on the acyclic grammar. - // FIXME: should SymbolIDs be so ordered instead? - RuleID Rule; - - bool operator==(const Family &Other) const { - return Start == Other.Start && Symbol == Other.Symbol; - } - // The larger Family is the one that should be processed first. - bool operator<(const Family &Other) const { - if (Start != Other.Start) - return Start < Other.Start; - if (Symbol != Other.Symbol) - return Rule > Other.Rule; - assert(*this == Other); - return false; - } - }; - - // A sequence is the ForestNode payloads of the GSS nodes we are reducing. - using Sequence = llvm::SmallVector; - // Like ArrayRef, but with the missing operator<. - // (Sequences are big to move by value as the collections gets rearranged). - struct SequenceRef { - SequenceRef(const Sequence &S) : S(S) {} - llvm::ArrayRef S; - friend bool operator==(SequenceRef A, SequenceRef B) { return A.S == B.S; } - friend bool operator<(const SequenceRef &A, const SequenceRef &B) { - return std::lexicographical_compare(A.S.begin(), A.S.end(), B.S.begin(), - B.S.end()); - } - }; - // Underlying storage for sequences pointed to by stored SequenceRefs. - std::deque SequenceStorage; - // We don't actually destroy the sequences between calls, to reuse storage. - // Everything SequenceStorage[ >=SequenceStorageCount ] is reusable scratch. - unsigned SequenceStorageCount; - - // Halfway through a reduction (after the pop, before the push), we have - // collected nodes for the RHS of a rule, and reached a base node. - // They specify a sequence ForestNode we may build (but we dedup first). - // (The RuleID is not stored here, but rather in the Family). - struct PushSpec { - // The last node popped before pushing. Its parent is the reduction base(s). - // (Base is more fundamental, but this is cheaper to store). - const GSS::Node* LastPop = nullptr; - Sequence *Seq = nullptr; - }; - KeyedQueue Sequences; // FIXME: rename => PendingPushes? - - // We treat Heads as a queue of Pop operations still to be performed. - // PoppedHeads is our position within it. - std::vector *Heads; - unsigned NextPopHead; - SymbolID Lookahead; - - Sequence TempSequence; -public: - GLRReduce(const ParseParams &Params, const Language &Lang) - : Params(Params), Lang(Lang) {} - - // Reduce Heads, resulting in new nodes that are appended to Heads. - // The "consumed" nodes are not removed! - // Only reduce rules compatible with the Lookahead are applied, though - // tokenSymbol(tok::unknown) will match any rule. - void operator()(std::vector &Heads, SymbolID Lookahead) { - assert(isToken(Lookahead)); - - NextPopHead = 0; - this->Heads = &Heads; - this->Lookahead = Lookahead; - assert(Sequences.empty()); - SequenceStorageCount = 0; - - popPending(); - while (!Sequences.empty()) { - pushNext(); - popPending(); - } - } - -private: - bool canReduce(const Rule &R, RuleID RID, - llvm::ArrayRef RHS) const { - if (!R.Guarded) - return true; - if (auto Guard = Lang.Guards.lookup(RID)) - return Guard({RHS, Params.Code, Lookahead}); - LLVM_DEBUG(llvm::dbgs() - << llvm::formatv("missing guard implementation for rule {0}\n", - Lang.G.dumpRule(RID))); - return true; - } - // pop walks up the parent chain(s) for a reduction from Head by to Rule. - // Once we reach the end, record the bases and sequences. - void pop(const GSS::Node *Head, RuleID RID, const Rule &Rule) { - LLVM_DEBUG(llvm::dbgs() << " Pop " << Lang.G.dumpRule(RID) << "\n"); - Family F{/*Start=*/0, /*Symbol=*/Rule.Target, /*Rule=*/RID}; - TempSequence.resize_for_overwrite(Rule.Size); - auto DFS = [&](const GSS::Node *N, unsigned I, auto &DFS) { - TempSequence[Rule.Size - 1 - I] = N->Payload; - if (I + 1 == Rule.Size) { - F.Start = TempSequence.front()->startTokenIndex(); - LLVM_DEBUG({ - for (const auto *B : N->parents()) - llvm::dbgs() << " --> base at S" << B->State << "\n"; - }); - if (!canReduce(Rule, RID, TempSequence)) - return; - // Copy the chain to stable storage so it can be enqueued. - if (SequenceStorageCount == SequenceStorage.size()) - SequenceStorage.emplace_back(); - SequenceStorage[SequenceStorageCount] = TempSequence; - Sequence *Seq = &SequenceStorage[SequenceStorageCount++]; - - Sequences.emplace(F, PushSpec{N, Seq}); - return; - } - for (const GSS::Node *Parent : N->parents()) - DFS(Parent, I + 1, DFS); - }; - DFS(Head, 0, DFS); - } - - // popPending pops every available reduction. - void popPending() { - for (; NextPopHead < Heads->size(); ++NextPopHead) { - // In trivial cases, we perform the complete reduce here! - if (popAndPushTrivial()) - continue; - for (RuleID RID : - Lang.Table.getReduceRules((*Heads)[NextPopHead]->State)) { - const auto &Rule = Lang.G.lookupRule(RID); - if (Lang.Table.canFollow(Rule.Target, Lookahead)) - pop((*Heads)[NextPopHead], RID, Rule); - } - } - } - - // Storage reused by each call to pushNext. - std::vector> FamilyBases; - std::vector> FamilySequences; - std::vector Parents; - std::vector SequenceNodes; - - // Process one push family, forming a forest node. - // This produces new GSS heads which may enable more pops. - void pushNext() { - assert(!Sequences.empty()); - Family F = Sequences.top().first; - - LLVM_DEBUG(llvm::dbgs() << " Push " << Lang.G.symbolName(F.Symbol) - << " from token " << F.Start << "\n"); - - // Grab the sequences and bases for this family. - // We don't care which rule yielded each base. If Family.Symbol is S, the - // base includes an item X := ... • S ... and since the grammar is - // context-free, *all* parses of S are valid here. - FamilySequences.clear(); - FamilyBases.clear(); - do { - const PushSpec &Push = Sequences.top().second; - FamilySequences.emplace_back(Sequences.top().first.Rule, *Push.Seq); - for (const GSS::Node *Base : Push.LastPop->parents()) { - auto NextState = Lang.Table.getGoToState(Base->State, F.Symbol); - assert(NextState.has_value() && "goto must succeed after reduce!"); - FamilyBases.emplace_back(*NextState, Base); - } - - Sequences.pop(); - } while (!Sequences.empty() && Sequences.top().first == F); - // Build a forest node for each unique sequence. - sortAndUnique(FamilySequences); - SequenceNodes.clear(); - for (const auto &SequenceSpec : FamilySequences) - SequenceNodes.push_back(&Params.Forest.createSequence( - F.Symbol, SequenceSpec.first, SequenceSpec.second.S)); - // Wrap in an ambiguous node if needed. - const ForestNode *Parsed = - SequenceNodes.size() == 1 - ? SequenceNodes.front() - : &Params.Forest.createAmbiguous(F.Symbol, SequenceNodes); - LLVM_DEBUG(llvm::dbgs() << " --> " << Parsed->dump(Lang.G) << "\n"); - - // Bases for this family, deduplicate them, and group by the goTo State. - sortAndUnique(FamilyBases); - // Create a GSS node for each unique goto state. - llvm::ArrayRef BasesLeft = FamilyBases; - while (!BasesLeft.empty()) { - StateID NextState = BasesLeft.front().first; - Parents.clear(); - for (const auto &Base : BasesLeft) { - if (Base.first != NextState) - break; - Parents.push_back(Base.second); - } - BasesLeft = BasesLeft.drop_front(Parents.size()); - Heads->push_back(Params.GSStack.addNode(NextState, Parsed, Parents)); - } - } - - // In general we split a reduce into a pop/push, so concurrently-available - // reductions can run in the correct order. The data structures are expensive. - // - // When only one reduction is possible at a time, we can skip this: - // we pop and immediately push, as an LR parser (as opposed to GLR) would. - // This is valid whenever there's only one concurrent PushSpec. - // - // This function handles a trivial but common subset of these cases: - // - there must be no pending pushes, and only one poppable head - // - the head must have only one reduction rule - // - the reduction path must be a straight line (no multiple parents) - // (Roughly this means there's no local ambiguity, so the LR algorithm works). - // - // Returns true if we successfully consumed the next unpopped head. - bool popAndPushTrivial() { - if (!Sequences.empty() || Heads->size() != NextPopHead + 1) - return false; - const GSS::Node *Head = Heads->back(); - std::optional RID; - for (RuleID R : Lang.Table.getReduceRules(Head->State)) { - if (RID.has_value()) - return false; - RID = R; - } - if (!RID) - return true; // no reductions available, but we've processed the head! - const auto &Rule = Lang.G.lookupRule(*RID); - if (!Lang.Table.canFollow(Rule.Target, Lookahead)) - return true; // reduction is not available - const GSS::Node *Base = Head; - TempSequence.resize_for_overwrite(Rule.Size); - for (unsigned I = 0; I < Rule.Size; ++I) { - if (Base->parents().size() != 1) - return false; - TempSequence[Rule.Size - 1 - I] = Base->Payload; - Base = Base->parents().front(); - } - if (!canReduce(Rule, *RID, TempSequence)) - return true; // reduction is not available - const ForestNode *Parsed = - &Params.Forest.createSequence(Rule.Target, *RID, TempSequence); - auto NextState = Lang.Table.getGoToState(Base->State, Rule.Target); - assert(NextState.has_value() && "goto must succeed after reduce!"); - Heads->push_back(Params.GSStack.addNode(*NextState, Parsed, {Base})); - LLVM_DEBUG(llvm::dbgs() - << " Reduce (trivial) " << Lang.G.dumpRule(*RID) << "\n" - << " --> S" << Heads->back()->State << "\n"); - return true; - } -}; - -} // namespace - -ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol, - const Language &Lang) { - GLRReduce Reduce(Params, Lang); - assert(isNonterminal(StartSymbol) && "Start symbol must be a nonterminal"); - llvm::ArrayRef Terminals = Params.Forest.createTerminals(Params.Code); - auto &GSS = Params.GSStack; - - StateID StartState = Lang.Table.getStartState(StartSymbol); - // Heads correspond to the parse of tokens [0, I), NextHeads to [0, I+1). - std::vector Heads = {GSS.addNode(/*State=*/StartState, - /*ForestNode=*/nullptr, - {})}; - // Invariant: Heads is partitioned by source: {shifted | reduced}. - // HeadsPartition is the index of the first head formed by reduction. - // We use this to discard and recreate the reduced heads during recovery. - unsigned HeadsPartition = Heads.size(); - std::vector NextHeads; - auto MaybeGC = [&, Roots(std::vector{}), I(0u)]() mutable { - assert(NextHeads.empty() && "Running GC at the wrong time!"); - if (++I != 20) // Run periodically to balance CPU and memory usage. - return; - I = 0; - - // We need to copy the list: Roots is consumed by the GC. - Roots = Heads; - GSS.gc(std::move(Roots)); - }; - // Each iteration fully processes a single token. - for (unsigned I = 0; I < Terminals.size();) { - LLVM_DEBUG(llvm::dbgs() << llvm::formatv( - "Next token {0} (id={1})\n", - Lang.G.symbolName(Terminals[I].symbol()), Terminals[I].symbol())); - // Consume the token. - glrShift(Heads, Terminals[I], Params, Lang, NextHeads); - - // If we weren't able to consume the token, try to skip over some tokens - // so we can keep parsing. - if (NextHeads.empty()) { - // The reduction in the previous round was constrained by lookahead. - // On valid code this only rejects dead ends, but on broken code we should - // consider all possibilities. - // - // We discard all heads formed by reduction, and recreate them without - // this constraint. This may duplicate some nodes, but it's rare. - LLVM_DEBUG(llvm::dbgs() << "Shift failed, will attempt recovery. " - "Re-reducing without lookahead.\n"); - Heads.resize(HeadsPartition); - Reduce(Heads, /*allow all reductions*/ tokenSymbol(tok::unknown)); - - glrRecover(Heads, I, Params, Lang, NextHeads); - if (NextHeads.empty()) - // FIXME: Ensure the `_ := start-symbol` rules have a fallback - // error-recovery strategy attached. Then this condition can't happen. - return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0); - } else - ++I; - - // Form nonterminals containing the token we just consumed. - SymbolID Lookahead = - I == Terminals.size() ? tokenSymbol(tok::eof) : Terminals[I].symbol(); - HeadsPartition = NextHeads.size(); - Reduce(NextHeads, Lookahead); - // Prepare for the next token. - std::swap(Heads, NextHeads); - NextHeads.clear(); - MaybeGC(); - } - LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n")); - - // The parse was successful if in state `_ := start-symbol EOF .` - // The GSS parent has `_ := start-symbol . EOF`; its payload is the parse. - auto AfterStart = Lang.Table.getGoToState(StartState, StartSymbol); - assert(AfterStart.has_value() && "goto must succeed after start symbol!"); - auto Accept = Lang.Table.getShiftState(*AfterStart, tokenSymbol(tok::eof)); - assert(Accept.has_value() && "shift EOF must succeed!"); - auto SearchForAccept = [&](llvm::ArrayRef Heads) { - const ForestNode *Result = nullptr; - for (const auto *Head : Heads) { - if (Head->State == *Accept) { - assert(Head->Payload->symbol() == tokenSymbol(tok::eof)); - assert(Result == nullptr && "multiple results!"); - Result = Head->parents().front()->Payload; - assert(Result->symbol() == StartSymbol); - } - } - return Result; - }; - if (auto *Result = SearchForAccept(Heads)) - return *const_cast(Result); // Safe: we created all nodes. - // We failed to parse the input, returning an opaque forest node for recovery. - // FIXME: as above, we can add fallback error handling so this is impossible. - return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0); -} - -void glrReduce(std::vector &Heads, SymbolID Lookahead, - const ParseParams &Params, const Language &Lang) { - // Create a new GLRReduce each time for tests, performance doesn't matter. - GLRReduce{Params, Lang}(Heads, Lookahead); -} - -const GSS::Node *GSS::addNode(LRTable::StateID State, const ForestNode *Symbol, - llvm::ArrayRef Parents) { - Node *Result = new (allocate(Parents.size())) Node(); - Result->State = State; - Result->GCParity = GCParity; - Result->ParentCount = Parents.size(); - Alive.push_back(Result); - ++NodesCreated; - Result->Payload = Symbol; - if (!Parents.empty()) - llvm::copy(Parents, reinterpret_cast(Result + 1)); - return Result; -} - -GSS::Node *GSS::allocate(unsigned Parents) { - if (FreeList.size() <= Parents) - FreeList.resize(Parents + 1); - auto &SizedList = FreeList[Parents]; - if (!SizedList.empty()) { - auto *Result = SizedList.back(); - SizedList.pop_back(); - return Result; - } - return static_cast( - Arena.Allocate(sizeof(Node) + Parents * sizeof(Node *), alignof(Node))); -} - -void GSS::destroy(Node *N) { - unsigned ParentCount = N->ParentCount; - N->~Node(); - assert(FreeList.size() > ParentCount && "established on construction!"); - FreeList[ParentCount].push_back(N); -} - -unsigned GSS::gc(std::vector &&Queue) { -#ifndef NDEBUG - auto ParityMatches = [&](const Node *N) { return N->GCParity == GCParity; }; - assert("Before GC" && llvm::all_of(Alive, ParityMatches)); - auto Deferred = llvm::make_scope_exit( - [&] { assert("After GC" && llvm::all_of(Alive, ParityMatches)); }); - assert(llvm::all_of( - Queue, [&](const Node *R) { return llvm::is_contained(Alive, R); })); -#endif - unsigned InitialCount = Alive.size(); - - // Mark - GCParity = !GCParity; - while (!Queue.empty()) { - Node *N = const_cast(Queue.back()); // Safe: we created these nodes. - Queue.pop_back(); - if (N->GCParity != GCParity) { // Not seen yet - N->GCParity = GCParity; // Mark as seen - for (const Node *P : N->parents()) // And walk parents - Queue.push_back(P); - } - } - // Sweep - llvm::erase_if(Alive, [&](Node *N) { - if (N->GCParity == GCParity) // Walk reached this node. - return false; - destroy(N); - return true; - }); - - LLVM_DEBUG(llvm::dbgs() << "GC pruned " << (InitialCount - Alive.size()) - << "/" << InitialCount << " GSS nodes\n"); - return InitialCount - Alive.size(); -} - -} // namespace pseudo -} // namespace clang diff --git a/clang-tools-extra/pseudo/lib/cli/CLI.cpp b/clang-tools-extra/pseudo/lib/cli/CLI.cpp deleted file mode 100644 index 5c7c3b6c827ea..0000000000000 --- a/clang-tools-extra/pseudo/lib/cli/CLI.cpp +++ /dev/null @@ -1,54 +0,0 @@ -//===--- CLI.cpp - ----------------------------------------------*- C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "clang-pseudo/cli/CLI.h" -#include "clang-pseudo/cxx/CXX.h" -#include "clang-pseudo/grammar/Grammar.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/ErrorOr.h" -#include "llvm/Support/MemoryBuffer.h" - -static llvm::cl::opt Grammar( - "grammar", - llvm::cl::desc( - "Specify a BNF grammar file path, or a builtin language (cxx)."), - llvm::cl::init("cxx")); - -namespace clang { -namespace pseudo { - -const Language &getLanguageFromFlags() { - if (::Grammar == "cxx") - return cxx::getLanguage(); - - static Language *Lang = []() { - // Read from a bnf grammar file. - llvm::ErrorOr> GrammarText = - llvm::MemoryBuffer::getFile(::Grammar); - if (std::error_code EC = GrammarText.getError()) { - llvm::errs() << "Error: can't read grammar file '" << ::Grammar - << "': " << EC.message() << "\n"; - std::exit(1); - } - std::vector Diags; - auto G = Grammar::parseBNF(GrammarText->get()->getBuffer(), Diags); - for (const auto &Diag : Diags) - llvm::errs() << Diag << "\n"; - auto Table = LRTable::buildSLR(G); - return new Language{ - std::move(G), - std::move(Table), - llvm::DenseMap(), - llvm::DenseMap(), - }; - }(); - return *Lang; -} - -} // namespace pseudo -} // namespace clang diff --git a/clang-tools-extra/pseudo/lib/cli/CMakeLists.txt b/clang-tools-extra/pseudo/lib/cli/CMakeLists.txt deleted file mode 100644 index 68e644f62fded..0000000000000 --- a/clang-tools-extra/pseudo/lib/cli/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -set(LLVM_LINK_COMPONENTS - Support - ) - -add_clang_library(clangPseudoCLI - CLI.cpp - - # FIXME export the headers from clangPseudoCXX instead - DEPENDS - cxx_gen - - LINK_LIBS - clangPseudoGrammar - clangPseudoCXX - ) diff --git a/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt b/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt deleted file mode 100644 index d56d16c893c3d..0000000000000 --- a/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -set(LLVM_LINK_COMPONENTS - Support - ) - -add_clang_library(clangPseudoCXX - CXX.cpp - - DEPENDS - cxx_gen - - LINK_LIBS - clangBasic - clangPseudo - clangPseudoGrammar - ) diff --git a/clang-tools-extra/pseudo/lib/cxx/CXX.cpp b/clang-tools-extra/pseudo/lib/cxx/CXX.cpp deleted file mode 100644 index 4188dab31d3a9..0000000000000 --- a/clang-tools-extra/pseudo/lib/cxx/CXX.cpp +++ /dev/null @@ -1,452 +0,0 @@ -//===--- CXX.cpp - Define public interfaces for C++ grammar ---------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "clang-pseudo/cxx/CXX.h" -#include "clang-pseudo/Forest.h" -#include "clang-pseudo/Language.h" -#include "clang-pseudo/grammar/Grammar.h" -#include "clang-pseudo/grammar/LRTable.h" -#include "clang/Basic/CharInfo.h" -#include "clang/Basic/TokenKinds.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/Support/Debug.h" -#include -#define DEBUG_TYPE "CXX.cpp" - -namespace clang { -namespace pseudo { -namespace cxx { -namespace { -static const char *CXXBNF = -#include "CXXBNF.inc" - ; - -// User-defined string literals look like `""suffix`. -bool isStringUserDefined(const Token &Tok) { - return !Tok.text().ends_with("\""); -} -bool isCharUserDefined(const Token &Tok) { return !Tok.text().ends_with("'"); } - -// Combinable flags describing numbers. -// Clang has just one numeric_token kind, the grammar has 4. -enum NumericKind { - Integer = 0, - Floating = 1 << 0, - UserDefined = 1 << 1, -}; -// Determine the kind of numeric_constant we have. -// We can assume it's something valid, as it has been lexed. -// FIXME: is this expensive enough that we should set flags on the token -// and reuse them rather than computing it for each guard? -unsigned numKind(const Token &Tok) { - assert(Tok.Kind == tok::numeric_constant); - llvm::StringRef Text = Tok.text(); - if (Text.size() <= 1) - return Integer; - bool Hex = - Text.size() > 2 && Text[0] == '0' && (Text[1] == 'x' || Text[1] == 'X'); - uint8_t K = Integer; - - for (char C : Text) { - switch (C) { - case '.': - K |= Floating; - break; - case 'e': - case 'E': - if (!Hex) - K |= Floating; - break; - case 'p': - case 'P': - if (Hex) - K |= Floating; - break; - case '_': - K |= UserDefined; - break; - default: - break; - } - } - - // We would be done here, but there are stdlib UDLs that lack _. - // We must distinguish these from the builtin suffixes. - unsigned LastLetter = Text.size(); - while (LastLetter > 0 && isLetter(Text[LastLetter - 1])) - --LastLetter; - if (LastLetter == Text.size()) // Common case - return NumericKind(K); - // Trailing d/e/f are not part of the suffix in hex numbers. - while (Hex && LastLetter < Text.size() && isHexDigit(Text[LastLetter])) - ++LastLetter; - return llvm::StringSwitch(Text.substr(LastLetter)) - // std::chrono - .Cases("h", "min", "s", "ms", "us", "ns", "d", "y", K | UserDefined) - // complex - .Cases("il", "i", "if", K | UserDefined) - .Default(K); -} - -// RHS is expected to contain a single terminal. -// Returns the corresponding token. -const Token &onlyToken(tok::TokenKind Kind, - const ArrayRef RHS, - const TokenStream &Tokens) { - assert(RHS.size() == 1 && RHS.front()->symbol() == tokenSymbol(Kind)); - return Tokens.tokens()[RHS.front()->startTokenIndex()]; -} -// RHS is expected to contain a single symbol. -// Returns the corresponding ForestNode. -const ForestNode &onlySymbol(SymbolID Kind, - const ArrayRef RHS, - const TokenStream &Tokens) { - assert(RHS.size() == 1 && RHS.front()->symbol() == Kind); - return *RHS.front(); -} - -bool isFunctionDeclarator(const ForestNode *Declarator) { - assert(Declarator->symbol() == cxx::Symbol::declarator); - bool IsFunction = false; - while (true) { - // not well-formed code, return the best guess. - if (Declarator->kind() != ForestNode::Sequence) - return IsFunction; - - switch (Declarator->rule()) { - case rule::noptr_declarator::declarator_id: // reached the bottom - return IsFunction; - // *X is a nonfunction (unless X is a function). - case rule::ptr_declarator::ptr_operator__ptr_declarator: - Declarator = Declarator->elements()[1]; - IsFunction = false; - continue; - // X() is a function (unless X is a pointer or similar). - case rule::declarator:: - noptr_declarator__parameters_and_qualifiers__trailing_return_type: - case rule::noptr_declarator::noptr_declarator__parameters_and_qualifiers: - Declarator = Declarator->elements()[0]; - IsFunction = true; - continue; - // X[] is an array (unless X is a pointer or function). - case rule::noptr_declarator:: - noptr_declarator__L_SQUARE__constant_expression__R_SQUARE: - case rule::noptr_declarator::noptr_declarator__L_SQUARE__R_SQUARE: - Declarator = Declarator->elements()[0]; - IsFunction = false; - continue; - // (X) is whatever X is. - case rule::noptr_declarator::L_PAREN__ptr_declarator__R_PAREN: - Declarator = Declarator->elements()[1]; - continue; - case rule::ptr_declarator::noptr_declarator: - case rule::declarator::ptr_declarator: - Declarator = Declarator->elements()[0]; - continue; - - default: - assert(false && "unhandled declarator for IsFunction"); - return IsFunction; - } - } - llvm_unreachable("unreachable"); -} - -bool guardNextTokenNotElse(const GuardParams &P) { - return symbolToToken(P.Lookahead) != tok::kw_else; -} - -bool specifiesStructuredBinding(const GuardParams &P) { - const auto DSS = P.RHS[0]; - assert(DSS->symbol() == Symbol::decl_specifier_seq); - - auto Length = P.RHS[1]->startTokenIndex() - DSS->startTokenIndex(); - for (const auto &T : - P.Tokens.tokens().slice(DSS->startTokenIndex(), Length)) { - switch (T.Kind) { - case clang::tok::kw_static: - case clang::tok::kw_thread_local: - case clang::tok::kw_auto: - case clang::tok::kw_const: - case clang::tok::kw_volatile: - break; - default: - return false; - } - } - return true; -} - -// Whether this e.g. decl-specifier contains an "exclusive" type such as a class -// name, and thus can't combine with a second exclusive type. -// -// Returns false for -// - non-types -// - "unsigned" etc that may suffice as types but may modify others -// - cases of uncertainty (e.g. due to ambiguity) -bool hasExclusiveType(const ForestNode *N) { - // FIXME: every time we apply this check, we walk the whole subtree. - // Add per-node caching instead. - while (true) { - assert(N->symbol() == Symbol::decl_specifier_seq || - N->symbol() == Symbol::type_specifier_seq || - N->symbol() == Symbol::defining_type_specifier_seq || - N->symbol() == Symbol::decl_specifier || - N->symbol() == Symbol::type_specifier || - N->symbol() == Symbol::defining_type_specifier || - N->symbol() == Symbol::simple_type_specifier); - if (N->kind() == ForestNode::Opaque) - return false; // conservative - if (N->kind() == ForestNode::Ambiguous) - return llvm::all_of(N->alternatives(), hasExclusiveType); // conservative - // All supported symbols are nonterminals. - assert(N->kind() == ForestNode::Sequence); - switch (N->rule()) { - // seq := element seq: check element then continue into seq - case rule::decl_specifier_seq::decl_specifier__decl_specifier_seq: - case rule::defining_type_specifier_seq::defining_type_specifier__defining_type_specifier_seq: - case rule::type_specifier_seq::type_specifier__type_specifier_seq: - if (hasExclusiveType(N->children()[0])) - return true; - N = N->children()[1]; - continue; - // seq := element: continue into element - case rule::decl_specifier_seq::decl_specifier: - case rule::type_specifier_seq::type_specifier: - case rule::defining_type_specifier_seq::defining_type_specifier: - N = N->children()[0]; - continue; - - // defining-type-specifier - case rule::defining_type_specifier::type_specifier: - N = N->children()[0]; - continue; - case rule::defining_type_specifier::class_specifier: - case rule::defining_type_specifier::enum_specifier: - return true; - - // decl-specifier - case rule::decl_specifier::defining_type_specifier: - N = N->children()[0]; - continue; - case rule::decl_specifier::CONSTEVAL: - case rule::decl_specifier::CONSTEXPR: - case rule::decl_specifier::CONSTINIT: - case rule::decl_specifier::INLINE: - case rule::decl_specifier::FRIEND: - case rule::decl_specifier::storage_class_specifier: - case rule::decl_specifier::TYPEDEF: - case rule::decl_specifier::function_specifier: - return false; - - // type-specifier - case rule::type_specifier::elaborated_type_specifier: - case rule::type_specifier::typename_specifier: - return true; - case rule::type_specifier::simple_type_specifier: - N = N->children()[0]; - continue; - case rule::type_specifier::cv_qualifier: - return false; - - // simple-type-specifier - case rule::simple_type_specifier::type_name: - case rule::simple_type_specifier::template_name: - case rule::simple_type_specifier::builtin_type: - case rule::simple_type_specifier::nested_name_specifier__TEMPLATE__simple_template_id: - case rule::simple_type_specifier::nested_name_specifier__template_name: - case rule::simple_type_specifier::nested_name_specifier__type_name: - case rule::simple_type_specifier::decltype_specifier: - case rule::simple_type_specifier::placeholder_type_specifier: - return true; - case rule::simple_type_specifier::LONG: - case rule::simple_type_specifier::SHORT: - case rule::simple_type_specifier::SIGNED: - case rule::simple_type_specifier::UNSIGNED: - return false; - - default: - LLVM_DEBUG(llvm::errs() << "Unhandled rule " << N->rule() << "\n"); - llvm_unreachable("hasExclusiveType be exhaustive!"); - } - } -} - -llvm::DenseMap buildGuards() { -#define GUARD(cond) \ - { \ - [](const GuardParams &P) { return cond; } \ - } -#define TOKEN_GUARD(kind, cond) \ - [](const GuardParams& P) { \ - const Token &Tok = onlyToken(tok::kind, P.RHS, P.Tokens); \ - return cond; \ - } -#define SYMBOL_GUARD(kind, cond) \ - [](const GuardParams& P) { \ - const ForestNode &N = onlySymbol(Symbol::kind, P.RHS, P.Tokens); \ - return cond; \ - } - return { - {rule::function_declarator::declarator, - SYMBOL_GUARD(declarator, isFunctionDeclarator(&N))}, - {rule::non_function_declarator::declarator, - SYMBOL_GUARD(declarator, !isFunctionDeclarator(&N))}, - - // A {decl,type,defining-type}-specifier-sequence cannot have multiple - // "exclusive" types (like class names): a value has only one type. - {rule::defining_type_specifier_seq:: - defining_type_specifier__defining_type_specifier_seq, - GUARD(!hasExclusiveType(P.RHS[0]) || !hasExclusiveType(P.RHS[1]))}, - {rule::type_specifier_seq::type_specifier__type_specifier_seq, - GUARD(!hasExclusiveType(P.RHS[0]) || !hasExclusiveType(P.RHS[1]))}, - {rule::decl_specifier_seq::decl_specifier__decl_specifier_seq, - GUARD(!hasExclusiveType(P.RHS[0]) || !hasExclusiveType(P.RHS[1]))}, - - {rule::contextual_override::IDENTIFIER, - TOKEN_GUARD(identifier, Tok.text() == "override")}, - {rule::contextual_final::IDENTIFIER, - TOKEN_GUARD(identifier, Tok.text() == "final")}, - {rule::import_keyword::IDENTIFIER, - TOKEN_GUARD(identifier, Tok.text() == "import")}, - {rule::export_keyword::IDENTIFIER, - TOKEN_GUARD(identifier, Tok.text() == "export")}, - {rule::module_keyword::IDENTIFIER, - TOKEN_GUARD(identifier, Tok.text() == "module")}, - {rule::contextual_zero::NUMERIC_CONSTANT, - TOKEN_GUARD(numeric_constant, Tok.text() == "0")}, - - {rule::selection_statement::IF__L_PAREN__condition__R_PAREN__statement, - guardNextTokenNotElse}, - {rule::selection_statement:: - IF__L_PAREN__init_statement__condition__R_PAREN__statement, - guardNextTokenNotElse}, - {rule::selection_statement:: - IF__CONSTEXPR__L_PAREN__condition__R_PAREN__statement, - guardNextTokenNotElse}, - {rule::selection_statement:: - IF__CONSTEXPR__L_PAREN__init_statement__condition__R_PAREN__statement, - guardNextTokenNotElse}, - - // Implement C++ [basic.lookup.qual.general]: - // If a name, template-id, or decltype-specifier is followed by a - // ​::​, it shall designate a namespace, class, enumeration, or - // dependent type, and the ​::​ is never interpreted as a complete - // nested-name-specifier. - {rule::nested_name_specifier::COLONCOLON, - TOKEN_GUARD(coloncolon, Tok.prev().Kind != tok::identifier)}, - - // Implement C++ [dcl.pre#6]: - // A simple-declaration with an identifier-list is called a structured - // binding declaration ([dcl.struct.bind]). If the decl-specifier-seq - // contains any decl-specifier other than static, thread_­local, auto, - // or cv-qualifiers, the program is ill-formed. - {rule::simple_declaration:: - decl_specifier_seq__ref_qualifier__L_SQUARE__identifier_list__R_SQUARE__initializer__SEMI, - specifiesStructuredBinding}, - {rule::simple_declaration:: - decl_specifier_seq__L_SQUARE__identifier_list__R_SQUARE__initializer__SEMI, - specifiesStructuredBinding}, - - // The grammar distinguishes (only) user-defined vs plain string literals, - // where the clang lexer distinguishes (only) encoding types. - {rule::user_defined_string_literal_chunk::STRING_LITERAL, - TOKEN_GUARD(string_literal, isStringUserDefined(Tok))}, - {rule::user_defined_string_literal_chunk::UTF8_STRING_LITERAL, - TOKEN_GUARD(utf8_string_literal, isStringUserDefined(Tok))}, - {rule::user_defined_string_literal_chunk::UTF16_STRING_LITERAL, - TOKEN_GUARD(utf16_string_literal, isStringUserDefined(Tok))}, - {rule::user_defined_string_literal_chunk::UTF32_STRING_LITERAL, - TOKEN_GUARD(utf32_string_literal, isStringUserDefined(Tok))}, - {rule::user_defined_string_literal_chunk::WIDE_STRING_LITERAL, - TOKEN_GUARD(wide_string_literal, isStringUserDefined(Tok))}, - {rule::string_literal_chunk::STRING_LITERAL, - TOKEN_GUARD(string_literal, !isStringUserDefined(Tok))}, - {rule::string_literal_chunk::UTF8_STRING_LITERAL, - TOKEN_GUARD(utf8_string_literal, !isStringUserDefined(Tok))}, - {rule::string_literal_chunk::UTF16_STRING_LITERAL, - TOKEN_GUARD(utf16_string_literal, !isStringUserDefined(Tok))}, - {rule::string_literal_chunk::UTF32_STRING_LITERAL, - TOKEN_GUARD(utf32_string_literal, !isStringUserDefined(Tok))}, - {rule::string_literal_chunk::WIDE_STRING_LITERAL, - TOKEN_GUARD(wide_string_literal, !isStringUserDefined(Tok))}, - // And the same for chars. - {rule::user_defined_character_literal::CHAR_CONSTANT, - TOKEN_GUARD(char_constant, isCharUserDefined(Tok))}, - {rule::user_defined_character_literal::UTF8_CHAR_CONSTANT, - TOKEN_GUARD(utf8_char_constant, isCharUserDefined(Tok))}, - {rule::user_defined_character_literal::UTF16_CHAR_CONSTANT, - TOKEN_GUARD(utf16_char_constant, isCharUserDefined(Tok))}, - {rule::user_defined_character_literal::UTF32_CHAR_CONSTANT, - TOKEN_GUARD(utf32_char_constant, isCharUserDefined(Tok))}, - {rule::user_defined_character_literal::WIDE_CHAR_CONSTANT, - TOKEN_GUARD(wide_char_constant, isCharUserDefined(Tok))}, - {rule::character_literal::CHAR_CONSTANT, - TOKEN_GUARD(char_constant, !isCharUserDefined(Tok))}, - {rule::character_literal::UTF8_CHAR_CONSTANT, - TOKEN_GUARD(utf8_char_constant, !isCharUserDefined(Tok))}, - {rule::character_literal::UTF16_CHAR_CONSTANT, - TOKEN_GUARD(utf16_char_constant, !isCharUserDefined(Tok))}, - {rule::character_literal::UTF32_CHAR_CONSTANT, - TOKEN_GUARD(utf32_char_constant, !isCharUserDefined(Tok))}, - {rule::character_literal::WIDE_CHAR_CONSTANT, - TOKEN_GUARD(wide_char_constant, !isCharUserDefined(Tok))}, - // clang just has one NUMERIC_CONSTANT token for {ud,plain}x{float,int} - {rule::user_defined_integer_literal::NUMERIC_CONSTANT, - TOKEN_GUARD(numeric_constant, numKind(Tok) == (Integer | UserDefined))}, - {rule::user_defined_floating_point_literal::NUMERIC_CONSTANT, - TOKEN_GUARD(numeric_constant, numKind(Tok) == (Floating | UserDefined))}, - {rule::integer_literal::NUMERIC_CONSTANT, - TOKEN_GUARD(numeric_constant, numKind(Tok) == Integer)}, - {rule::floating_point_literal::NUMERIC_CONSTANT, - TOKEN_GUARD(numeric_constant, numKind(Tok) == Floating)}, - }; -#undef TOKEN_GUARD -#undef SYMBOL_GUARD -} - -Token::Index recoverBrackets(Token::Index Begin, const TokenStream &Tokens) { - assert(Begin > 0); - const Token &Left = Tokens.tokens()[Begin - 1]; - assert(Left.Kind == tok::l_brace || Left.Kind == tok::l_paren || - Left.Kind == tok::l_square); - if (const Token *Right = Left.pair()) { - assert(Tokens.index(*Right) > Begin - 1); - return Tokens.index(*Right); - } - return Token::Invalid; -} - -llvm::DenseMap buildRecoveryStrategies() { - return { - {Extension::Brackets, recoverBrackets}, - }; -} - -} // namespace - -const Language &getLanguage() { - static const auto &CXXLanguage = []() -> const Language & { - std::vector Diags; - auto G = Grammar::parseBNF(CXXBNF, Diags); - assert(Diags.empty()); - LRTable Table = LRTable::buildSLR(G); - const Language *PL = new Language{ - std::move(G), - std::move(Table), - buildGuards(), - buildRecoveryStrategies(), - }; - return *PL; - }(); - return CXXLanguage; -} - -} // namespace cxx -} // namespace pseudo -} // namespace clang diff --git a/clang-tools-extra/pseudo/lib/cxx/cxx.bnf b/clang-tools-extra/pseudo/lib/cxx/cxx.bnf deleted file mode 100644 index 36caf7b1e6337..0000000000000 --- a/clang-tools-extra/pseudo/lib/cxx/cxx.bnf +++ /dev/null @@ -1,776 +0,0 @@ -# This is a C++ grammar from the C++ standard [1]. -# -# The grammar is a superset of the true grammar requring semantic constraints to -# resolve ambiguities. The grammar is context-free and ambiguous (beyond the -# limit of LR(k)). We use general parsing algorithm (e.g GLR) to handle the -# grammar and generate a transition table which is used to drive the parsing. -# -# It aims to align with the ISO C++ grammar as much as possible. We adjust it -# to fit the need for the grammar-based parser: -# - attributes are omitted, which will be handled as comments; -# - we don't allow nullable nonterminal symbols. There are few nullable -# nonterminals in the spec grammar, they are adjusted to be non-nullable; -# - the file merely describes the core C++ grammar. Preprocessor directives and -# lexical conversions are omitted as we reuse clang's lexer and run a fake -# preprocessor; -# - grammar rules with the >> token are adjusted, the greatergreater token is -# split into two > tokens, to make the GLR parser aware of nested templates -# and right shift operator; -# -# Guidelines: -# - nonterminals are lower_case; terminals (aka tokens) correspond to -# clang::TokenKind, written as "IDENTIFIER", "USING", "::" etc; -# - optional symbols are supported, with a _opt suffix; -# -# [1] https://isocpp.org/files/papers/N4860.pdf - -# _ lists all the start-symbols which we support parsing. -# -# We list important nonterminals as start symbols, rather than doing it for all -# nonterminals by default, this reduces the number of states by 30% and LRTable -# actions by 16%. -_ := translation-unit EOF -_ := statement-seq EOF -_ := declaration-seq EOF - -# gram.key -#! we don't distinguish between namespaces and namespace aliases, as it's hard -#! and uninteresting. -namespace-name := IDENTIFIER -template-name := IDENTIFIER - -# gram.basic -#! Custom modifications to eliminate optional declaration-seq -translation-unit := declaration-seq -translation-unit := global-module-fragment_opt module-declaration declaration-seq_opt private-module-fragment_opt - -# gram.expr -# expr.prim -primary-expression := literal -primary-expression := THIS -primary-expression := ( expression ) -primary-expression := id-expression -primary-expression := lambda-expression -primary-expression := fold-expression -primary-expression := requires-expression -id-expression := unqualified-id -id-expression := qualified-id -unqualified-id := IDENTIFIER -unqualified-id := operator-function-id -unqualified-id := conversion-function-id -unqualified-id := literal-operator-id -unqualified-id := ~ type-name -unqualified-id := ~ decltype-specifier -unqualified-id := template-id -qualified-id := nested-name-specifier TEMPLATE_opt unqualified-id -nested-name-specifier := :: [guard] -nested-name-specifier := type-name :: -nested-name-specifier := namespace-name :: -nested-name-specifier := decltype-specifier :: -nested-name-specifier := nested-name-specifier IDENTIFIER :: -nested-name-specifier := nested-name-specifier TEMPLATE_opt simple-template-id :: -lambda-expression := lambda-introducer lambda-declarator_opt compound-statement -lambda-expression := lambda-introducer < template-parameter-list > requires-clause_opt lambda-declarator_opt compound-statement -#! We allow a capture-default to appear anywhere in a capture-list. -# This simplifies the grammar and error recovery. -lambda-introducer := [ capture-list_opt ] -lambda-declarator := ( parameter-declaration-clause_opt ) decl-specifier-seq_opt noexcept-specifier_opt trailing-return-type_opt requires-clause_opt -capture-list := capture -capture-list := capture-list , capture -capture := capture-default -capture := simple-capture -capture := init-capture -capture-default := & -capture-default := = -simple-capture := IDENTIFIER ..._opt -simple-capture := & IDENTIFIER ..._opt -simple-capture := THIS -simple-capture := * THIS -init-capture := ..._opt IDENTIFIER initializer -init-capture := & ..._opt IDENTIFIER initializer -fold-expression := ( cast-expression fold-operator ... ) -fold-expression := ( ... fold-operator cast-expression ) -fold-expression := ( cast-expression fold-operator ... fold-operator cast-expression ) -fold-operator := + -fold-operator := - -fold-operator := * -fold-operator := / -fold-operator := % -fold-operator := ^ -fold-operator := | -fold-operator := << -fold-operator := greatergreater -fold-operator := += -fold-operator := -= -fold-operator := *= -fold-operator := /= -fold-operator := %= -fold-operator := ^= -fold-operator := &= -fold-operator := |= -fold-operator := <<= -fold-operator := >>= -fold-operator := = -fold-operator := == -fold-operator := != -fold-operator := < -fold-operator := > -fold-operator := <= -fold-operator := >= -fold-operator := && -fold-operator := || -fold-operator := , -fold-operator := .* -fold-operator := ->* -requires-expression := REQUIRES requirement-parameter-list_opt requirement-body -requirement-parameter-list := ( parameter-declaration-clause_opt ) -requirement-body := { requirement-seq } -requirement-seq := requirement -requirement-seq := requirement-seq requirement -requirement := simple-requirement -requirement := type-requirement -requirement := compound-requirement -requirement := nested-requirement -simple-requirement := expression ; -type-requirement := TYPENAME nested-name-specifier_opt type-name ; -compound-requirement := { expression } NOEXCEPT_opt return-type-requirement_opt ; -return-type-requirement := -> type-constraint -nested-requirement := REQUIRES constraint-expression ; -# expr.post -postfix-expression := primary-expression -postfix-expression := postfix-expression [ expr-or-braced-init-list ] -postfix-expression := postfix-expression ( expression-list_opt ) -postfix-expression := simple-type-specifier ( expression-list_opt ) -postfix-expression := typename-specifier ( expression-list_opt ) -postfix-expression := simple-type-specifier braced-init-list -postfix-expression := postfix-expression . TEMPLATE_opt id-expression -postfix-expression := postfix-expression -> TEMPLATE_opt id-expression -postfix-expression := postfix-expression ++ -postfix-expression := postfix-expression -- -postfix-expression := DYNAMIC_CAST < type-id > ( expression ) -postfix-expression := STATIC_CAST < type-id > ( expression ) -postfix-expression := REINTERPRET_CAST < type-id > ( expression ) -postfix-expression := CONST_CAST < type-id > ( expression ) -postfix-expression := TYPEID ( expression ) -postfix-expression := TYPEID ( type-id ) -#! Standard defines expression-list in terms of initializer-list, but our -# initializer-list allows designators. -expression-list := initializer-clause ..._opt -expression-list := expression-list , initializer-clause ..._opt -# expr.unary -unary-expression := postfix-expression -unary-expression := unary-operator cast-expression -unary-expression := ++ cast-expression -unary-expression := -- cast-expression -unary-expression := await-expression -unary-expression := SIZEOF unary-expression -unary-expression := SIZEOF ( type-id ) -unary-expression := SIZEOF ... ( IDENTIFIER ) -unary-expression := ALIGNOF ( type-id ) -unary-expression := noexcept-expression -unary-expression := new-expression -unary-expression := delete-expression -unary-operator := * -unary-operator := & -unary-operator := + -unary-operator := - -unary-operator := ! -unary-operator := ~ -await-expression := CO_AWAIT cast-expression -noexcept-expression := NOEXCEPT ( expression ) -new-expression := ::_opt NEW new-placement_opt new-type-id new-initializer_opt -new-expression := ::_opt NEW new-placement_opt ( type-id ) new-initializer_opt -new-placement := ( expression-list ) -new-type-id := type-specifier-seq new-declarator_opt -new-declarator := ptr-operator new-declarator_opt -new-declarator := noptr-new-declarator -noptr-new-declarator := [ expression_opt ] -noptr-new-declarator := noptr-new-declarator [ constant-expression ] -new-initializer := ( expression-list_opt ) -new-initializer := braced-init-list -delete-expression := ::_opt DELETE cast-expression -delete-expression := ::_opt DELETE [ ] cast-expression -cast-expression := unary-expression -cast-expression := ( type-id ) cast-expression -# expr.mptr.oper -pm-expression := cast-expression -pm-expression := pm-expression .* cast-expression -pm-expression := pm-expression ->* cast-expression -# expr.mul -multiplicative-expression := pm-expression -multiplicative-expression := multiplicative-expression * pm-expression -multiplicative-expression := multiplicative-expression / pm-expression -multiplicative-expression := multiplicative-expression % pm-expression -# expr.add -additive-expression := multiplicative-expression -additive-expression := additive-expression + multiplicative-expression -additive-expression := additive-expression - multiplicative-expression -# expr.shift -shift-expression := additive-expression -shift-expression := shift-expression << additive-expression -shift-expression := shift-expression greatergreater additive-expression -# expr.spaceship -compare-expression := shift-expression -compare-expression := compare-expression <=> shift-expression -# expr.rel -relational-expression := compare-expression -relational-expression := relational-expression < compare-expression -relational-expression := relational-expression > compare-expression -relational-expression := relational-expression <= compare-expression -relational-expression := relational-expression >= compare-expression -# expr.eq -equality-expression := relational-expression -equality-expression := equality-expression == relational-expression -equality-expression := equality-expression != relational-expression -# expr.bit.and -and-expression := equality-expression -and-expression := and-expression & equality-expression -# expr.xor -exclusive-or-expression := and-expression -exclusive-or-expression := exclusive-or-expression ^ and-expression -# expr.or -inclusive-or-expression := exclusive-or-expression -inclusive-or-expression := inclusive-or-expression | exclusive-or-expression -# expr.log.and -logical-and-expression := inclusive-or-expression -logical-and-expression := logical-and-expression && inclusive-or-expression -# expr.log.or -logical-or-expression := logical-and-expression -logical-or-expression := logical-or-expression || logical-and-expression -# expr.cond -conditional-expression := logical-or-expression -conditional-expression := logical-or-expression ? expression : assignment-expression -# expr.ass -yield-expression := CO_YIELD assignment-expression -yield-expression := CO_YIELD braced-init-list -throw-expression := THROW assignment-expression_opt -assignment-expression := conditional-expression -assignment-expression := yield-expression -assignment-expression := throw-expression -assignment-expression := logical-or-expression assignment-operator initializer-clause -assignment-operator := = -assignment-operator := *= -assignment-operator := /= -assignment-operator := %= -assignment-operator := += -assignment-operator := -= -assignment-operator := >>= -assignment-operator := <<= -assignment-operator := &= -assignment-operator := ^= -assignment-operator := |= -# expr.comma -expression := assignment-expression -expression := expression , assignment-expression -# expr.const -constant-expression := conditional-expression - -# gram.stmt -statement := labeled-statement -statement := expression-statement -statement := compound-statement -statement := selection-statement -statement := iteration-statement -statement := jump-statement -statement := declaration-statement -statement := try-block -init-statement := expression-statement -init-statement := simple-declaration -condition := expression -condition := decl-specifier-seq declarator brace-or-equal-initializer -labeled-statement := IDENTIFIER : statement -labeled-statement := CASE constant-expression : statement -labeled-statement := DEFAULT : statement -expression-statement := expression_opt ; -compound-statement := { statement-seq_opt [recover=Brackets] } -statement-seq := statement -statement-seq := statement-seq statement -selection-statement := IF CONSTEXPR_opt ( init-statement_opt condition ) statement [guard] -selection-statement := IF CONSTEXPR_opt ( init-statement_opt condition ) statement ELSE statement -selection-statement := SWITCH ( init-statement_opt condition ) statement -iteration-statement := WHILE ( condition ) statement -iteration-statement := DO statement WHILE ( expression ) ; -iteration-statement := FOR ( init-statement condition_opt ; expression_opt ) statement -iteration-statement := FOR ( init-statement_opt for-range-declaration : for-range-initializer ) statement -for-range-declaration := decl-specifier-seq declarator -for-range-declaration := decl-specifier-seq ref-qualifier_opt [ identifier-list ] -for-range-initializer := expr-or-braced-init-list -jump-statement := BREAK ; -jump-statement := CONTINUE ; -jump-statement := RETURN expr-or-braced-init-list_opt ; -jump-statement := coroutine-return-statement -jump-statement := GOTO IDENTIFIER ; -coroutine-return-statement := CO_RETURN expr-or-braced-init-list_opt ; -declaration-statement := block-declaration - -# gram.dcl -declaration-seq := declaration -declaration-seq := declaration-seq declaration -declaration := block-declaration -declaration := nodeclspec-function-declaration -declaration := function-definition -declaration := template-declaration -declaration := deduction-guide -declaration := explicit-instantiation -declaration := explicit-specialization -declaration := export-declaration -declaration := linkage-specification -declaration := namespace-definition -declaration := empty-declaration -declaration := module-import-declaration -block-declaration := simple-declaration -block-declaration := asm-declaration -block-declaration := namespace-alias-definition -block-declaration := using-declaration -block-declaration := using-enum-declaration -block-declaration := using-directive -block-declaration := static_assert-declaration -block-declaration := alias-declaration -block-declaration := opaque-enum-declaration -nodeclspec-function-declaration := function-declarator ; -alias-declaration := USING IDENTIFIER = defining-type-id ; -simple-declaration := decl-specifier-seq init-declarator-list_opt ; -simple-declaration := decl-specifier-seq ref-qualifier_opt [ identifier-list ] initializer ; [guard] -static_assert-declaration := STATIC_ASSERT ( constant-expression ) ; -static_assert-declaration := STATIC_ASSERT ( constant-expression , string-literal ) ; -empty-declaration := ; -# dcl.spec -decl-specifier := storage-class-specifier -decl-specifier := defining-type-specifier -decl-specifier := function-specifier -decl-specifier := FRIEND -decl-specifier := TYPEDEF -decl-specifier := CONSTEXPR -decl-specifier := CONSTEVAL -decl-specifier := CONSTINIT -decl-specifier := INLINE -decl-specifier-seq := decl-specifier -decl-specifier-seq := decl-specifier decl-specifier-seq [guard] -storage-class-specifier := STATIC -storage-class-specifier := THREAD_LOCAL -storage-class-specifier := EXTERN -storage-class-specifier := MUTABLE -function-specifier := VIRTUAL -function-specifier := explicit-specifier -explicit-specifier := EXPLICIT ( constant-expression ) -explicit-specifier := EXPLICIT -type-specifier := simple-type-specifier -type-specifier := elaborated-type-specifier -type-specifier := typename-specifier -type-specifier := cv-qualifier -type-specifier-seq := type-specifier -type-specifier-seq := type-specifier type-specifier-seq [guard] -defining-type-specifier := type-specifier -defining-type-specifier := class-specifier -defining-type-specifier := enum-specifier -defining-type-specifier-seq := defining-type-specifier -defining-type-specifier-seq := defining-type-specifier defining-type-specifier-seq [guard] -simple-type-specifier := nested-name-specifier_opt type-name -simple-type-specifier := nested-name-specifier TEMPLATE simple-template-id -simple-type-specifier := decltype-specifier -simple-type-specifier := placeholder-type-specifier -simple-type-specifier := nested-name-specifier_opt template-name -simple-type-specifier := SHORT -simple-type-specifier := LONG -simple-type-specifier := SIGNED -simple-type-specifier := UNSIGNED -simple-type-specifier := builtin-type -#! builtin-type added to aid in classifying which specifiers may combined. -builtin-type := CHAR -builtin-type := CHAR8_T -builtin-type := CHAR16_T -builtin-type := CHAR32_T -builtin-type := WCHAR_T -builtin-type := BOOL -builtin-type := INT -builtin-type := FLOAT -builtin-type := DOUBLE -builtin-type := VOID -#! Unlike C++ standard grammar, we don't distinguish the underlying type (class, -#! enum, typedef) of the IDENTIFIER, as these ambiguities are "local" and don't -#! affect the final parse tree. Eliminating them gives a significant performance -#! boost to the parser. -type-name := IDENTIFIER -type-name := simple-template-id -elaborated-type-specifier := class-key nested-name-specifier_opt IDENTIFIER -elaborated-type-specifier := class-key simple-template-id -elaborated-type-specifier := class-key nested-name-specifier TEMPLATE_opt simple-template-id -elaborated-type-specifier := elaborated-enum-specifier -elaborated-enum-specifier := ENUM nested-name-specifier_opt IDENTIFIER -decltype-specifier := DECLTYPE ( expression ) -placeholder-type-specifier := type-constraint_opt AUTO -placeholder-type-specifier := type-constraint_opt DECLTYPE ( AUTO ) -init-declarator-list := init-declarator -init-declarator-list := init-declarator-list , init-declarator -#! The standard grammar allows: -#! 1) an initializer with any declarator, including a function declarator, this -#! creates an ambiguity where a function definition is misparsed as a simple -#! declaration; -#! 2) an function-body with any declarator, includeing a non-function -#! declarator, this creates an ambiguity whwere a simple-declaration is -#! misparsed as a function-definition; -#! We extend the standard declarator to function-declarator and non-function-declarator -#! to eliminate these false parses. -init-declarator := non-function-declarator initializer_opt -init-declarator := function-declarator requires-clause_opt -function-declarator := declarator [guard] -non-function-declarator := declarator [guard] -declarator := ptr-declarator -declarator := noptr-declarator parameters-and-qualifiers trailing-return-type -ptr-declarator := noptr-declarator -ptr-declarator := ptr-operator ptr-declarator -noptr-declarator := declarator-id -noptr-declarator := noptr-declarator parameters-and-qualifiers -noptr-declarator := noptr-declarator [ constant-expression_opt ] -noptr-declarator := ( ptr-declarator ) -parameters-and-qualifiers := ( parameter-declaration-clause_opt [recover=Brackets] ) cv-qualifier-seq_opt ref-qualifier_opt noexcept-specifier_opt -trailing-return-type := -> type-id -ptr-operator := * cv-qualifier-seq_opt -ptr-operator := & -ptr-operator := && -ptr-operator := nested-name-specifier * cv-qualifier-seq_opt -cv-qualifier-seq := cv-qualifier cv-qualifier-seq_opt -cv-qualifier := CONST -cv-qualifier := VOLATILE -ref-qualifier := & -ref-qualifier := && -declarator-id := ..._opt id-expression -type-id := type-specifier-seq abstract-declarator_opt -defining-type-id := defining-type-specifier-seq abstract-declarator_opt -abstract-declarator := ptr-abstract-declarator -abstract-declarator := noptr-abstract-declarator_opt parameters-and-qualifiers trailing-return-type -abstract-declarator := abstract-pack-declarator -ptr-abstract-declarator := noptr-abstract-declarator -ptr-abstract-declarator := ptr-operator ptr-abstract-declarator_opt -noptr-abstract-declarator := noptr-abstract-declarator_opt parameters-and-qualifiers -noptr-abstract-declarator := noptr-abstract-declarator_opt [ constant-expression_opt ] -noptr-abstract-declarator := ( ptr-abstract-declarator ) -abstract-pack-declarator := noptr-abstract-pack-declarator -abstract-pack-declarator := ptr-operator abstract-pack-declarator -noptr-abstract-pack-declarator := noptr-abstract-pack-declarator parameters-and-qualifiers -noptr-abstract-pack-declarator := noptr-abstract-pack-declarator [ constant-expression_opt ] -noptr-abstract-pack-declarator := ... -#! Custom modifications to avoid nullable clause. -parameter-declaration-clause := parameter-declaration-list -parameter-declaration-clause := parameter-declaration-list_opt ... -parameter-declaration-clause := parameter-declaration-list , ... -parameter-declaration-list := parameter-declaration -parameter-declaration-list := parameter-declaration-list , parameter-declaration -parameter-declaration := decl-specifier-seq declarator -parameter-declaration := decl-specifier-seq declarator = initializer-clause -parameter-declaration := decl-specifier-seq abstract-declarator_opt -parameter-declaration := decl-specifier-seq abstract-declarator_opt = initializer-clause -# dcl.init -initializer := brace-or-equal-initializer -initializer := ( expression-list ) -brace-or-equal-initializer := = initializer-clause -brace-or-equal-initializer := braced-init-list -initializer-clause := assignment-expression -initializer-clause := braced-init-list -#! Allow mixed designated/non-designated init-list. -# This is standard C, and accepted by clang and others as an extension. -# FIXME: Decouple recovery from is-there-a-trailing-comma! -braced-init-list := { initializer-list [recover=Brackets] } -braced-init-list := { initializer-list , } -braced-init-list := { } -initializer-list := initializer-list-item -initializer-list := initializer-list , initializer-list-item -initializer-list-item := initializer-clause ..._opt -initializer-list-item := designator brace-or-equal-initializer ..._opt -designator := . IDENTIFIER -#! Array designators are legal in C, and a common extension in C++. -designator := [ expression ] -expr-or-braced-init-list := expression -expr-or-braced-init-list := braced-init-list -# dcl.fct -function-definition := decl-specifier-seq_opt function-declarator virt-specifier-seq_opt function-body -function-definition := decl-specifier-seq_opt function-declarator requires-clause function-body -function-body := ctor-initializer_opt compound-statement -function-body := function-try-block -function-body := = DEFAULT ; -function-body := = DELETE ; -# dcl.enum -enum-specifier := enum-head { enumerator-list_opt } -enum-specifier := enum-head { enumerator-list , } -enum-head := enum-key enum-head-name_opt enum-base_opt -enum-head-name := nested-name-specifier_opt IDENTIFIER -opaque-enum-declaration := enum-key enum-head-name enum-base_opt ; -enum-key := ENUM -enum-key := ENUM CLASS -enum-key := ENUM STRUCT -enum-base := : type-specifier-seq -enumerator-list := enumerator-definition -enumerator-list := enumerator-list , enumerator-definition -enumerator-definition := enumerator -enumerator-definition := enumerator = constant-expression -enumerator := IDENTIFIER -using-enum-declaration := USING elaborated-enum-specifier ; -# basic.namespace -namespace-definition := named-namespace-definition -namespace-definition := unnamed-namespace-definition -namespace-definition := nested-namespace-definition -named-namespace-definition := INLINE_opt NAMESPACE IDENTIFIER { namespace-body_opt } -unnamed-namespace-definition := INLINE_opt NAMESPACE { namespace-body_opt } -nested-namespace-definition := NAMESPACE enclosing-namespace-specifier :: INLINE_opt IDENTIFIER { namespace-body } -enclosing-namespace-specifier := IDENTIFIER -enclosing-namespace-specifier := enclosing-namespace-specifier :: INLINE_opt IDENTIFIER -#! Custom modification to avoid nullable namespace-body. -namespace-body := declaration-seq -namespace-alias-definition := NAMESPACE IDENTIFIER = qualified-namespace-specifier ; -qualified-namespace-specifier := nested-name-specifier_opt namespace-name -using-directive := USING NAMESPACE nested-name-specifier_opt namespace-name ; -using-declaration := USING using-declarator-list ; -using-declarator-list := using-declarator ..._opt -using-declarator-list := using-declarator-list , using-declarator ..._opt -using-declarator := TYPENAME_opt nested-name-specifier unqualified-id -# dcl.asm -asm-declaration := ASM ( string-literal ) ; -# dcl.link -linkage-specification := EXTERN string-literal { declaration-seq_opt } -linkage-specification := EXTERN string-literal declaration - -# gram.module -module-declaration := export-keyword_opt module-keyword module-name module-partition_opt ; -module-name := module-name-qualifier_opt IDENTIFIER -module-partition := : module-name-qualifier_opt IDENTIFIER -module-name-qualifier := IDENTIFIER . -module-name-qualifier := module-name-qualifier IDENTIFIER . -export-declaration := EXPORT declaration -export-declaration := EXPORT { declaration-seq_opt } -export-declaration := export-keyword module-import-declaration -module-import-declaration := import-keyword module-name ; -module-import-declaration := import-keyword module-partition ; -# FIXME: we don't have header-name in the grammar. Handle these in PP? -# module-import-declaration := import-keyword header-name ; -global-module-fragment := module-keyword ; declaration-seq_opt -private-module-fragment := module-keyword : PRIVATE ; declaration-seq_opt - -# gram.class -class-specifier := class-head { member-specification_opt [recover=Brackets] } -class-head := class-key class-head-name class-virt-specifier_opt base-clause_opt -class-head := class-key base-clause_opt -class-head-name := nested-name-specifier_opt type-name -class-virt-specifier := contextual-final -class-key := CLASS -class-key := STRUCT -class-key := UNION -member-specification := member-declaration member-specification_opt -member-specification := access-specifier : member-specification_opt -member-declaration := decl-specifier-seq member-declarator-list_opt ; -member-declaration := member-declarator-list ; -member-declaration := function-definition -member-declaration := using-declaration -member-declaration := using-enum-declaration -member-declaration := static_assert-declaration -member-declaration := template-declaration -member-declaration := explicit-specialization -member-declaration := deduction-guide -member-declaration := alias-declaration -member-declaration := opaque-enum-declaration -member-declaration := empty-declaration -member-declarator-list := member-declarator -member-declarator-list := member-declarator-list , member-declarator -member-declarator := function-declarator virt-specifier-seq_opt pure-specifier_opt -member-declarator := function-declarator requires-clause -member-declarator := non-function-declarator brace-or-equal-initializer_opt -member-declarator := IDENTIFIER_opt : constant-expression brace-or-equal-initializer_opt -virt-specifier-seq := virt-specifier -virt-specifier-seq := virt-specifier-seq virt-specifier -virt-specifier := contextual-override -virt-specifier := contextual-final -pure-specifier := = contextual-zero -conversion-function-id := OPERATOR conversion-type-id -conversion-type-id := type-specifier-seq conversion-declarator_opt -conversion-declarator := ptr-operator conversion-declarator_opt -base-clause := : base-specifier-list -base-specifier-list := base-specifier ..._opt -base-specifier-list := base-specifier-list , base-specifier ..._opt -base-specifier := class-or-decltype -base-specifier := VIRTUAL access-specifier_opt class-or-decltype -base-specifier := access-specifier VIRTUAL_opt class-or-decltype -class-or-decltype := nested-name-specifier_opt type-name -class-or-decltype := nested-name-specifier TEMPLATE simple-template-id -class-or-decltype := decltype-specifier -access-specifier := PRIVATE -access-specifier := PROTECTED -access-specifier := PUBLIC -ctor-initializer := : mem-initializer-list -mem-initializer-list := mem-initializer ..._opt -mem-initializer-list := mem-initializer-list , mem-initializer ..._opt -mem-initializer := mem-initializer-id ( expression-list_opt ) -mem-initializer := mem-initializer-id braced-init-list -mem-initializer-id := class-or-decltype -mem-initializer-id := IDENTIFIER - -# gram.over -operator-function-id := OPERATOR operator-name -operator-name := NEW -operator-name := DELETE -operator-name := NEW [ ] -operator-name := DELETE [ ] -operator-name := CO_AWAIT -operator-name := ( ) -operator-name := [ ] -operator-name := -> -operator-name := ->* -operator-name := ~ -operator-name := ! -operator-name := + -operator-name := - -operator-name := * -operator-name := / -operator-name := % -operator-name := ^ -operator-name := & -operator-name := | -operator-name := = -operator-name := += -operator-name := -= -operator-name := *= -operator-name := /= -operator-name := %= -operator-name := ^= -operator-name := &= -operator-name := |= -operator-name := == -operator-name := != -operator-name := < -operator-name := > -operator-name := <= -operator-name := >= -operator-name := <=> -operator-name := ^^ -operator-name := || -operator-name := << -operator-name := greatergreater -operator-name := <<= -operator-name := >>= -operator-name := ++ -operator-name := -- -operator-name := , -literal-operator-id := OPERATOR string-literal IDENTIFIER -literal-operator-id := OPERATOR user-defined-string-literal - -# gram.temp -template-declaration := template-head declaration -template-declaration := template-head concept-definition -template-head := TEMPLATE < template-parameter-list > requires-clause_opt -template-parameter-list := template-parameter -template-parameter-list := template-parameter-list , template-parameter -requires-clause := REQUIRES constraint-logical-or-expression -constraint-logical-or-expression := constraint-logical-and-expression -constraint-logical-or-expression := constraint-logical-or-expression || constraint-logical-and-expression -constraint-logical-and-expression := primary-expression -constraint-logical-and-expression := constraint-logical-and-expression && primary-expression -template-parameter := type-parameter -template-parameter := parameter-declaration -type-parameter := type-parameter-key ..._opt IDENTIFIER_opt -type-parameter := type-parameter-key IDENTIFIER_opt = type-id -type-parameter := type-constraint ..._opt IDENTIFIER_opt -type-parameter := type-constraint IDENTIFIER_opt = type-id -type-parameter := template-head type-parameter-key ..._opt IDENTIFIER_opt -type-parameter := template-head type-parameter-key IDENTIFIER_opt = id-expression -type-parameter-key := CLASS -type-parameter-key := TYPENAME -type-constraint := nested-name-specifier_opt concept-name -type-constraint := nested-name-specifier_opt concept-name < template-argument-list_opt > -simple-template-id := template-name < template-argument-list_opt > -template-id := simple-template-id -template-id := operator-function-id < template-argument-list_opt > -template-id := literal-operator-id < template-argument-list_opt > -template-argument-list := template-argument ..._opt -template-argument-list := template-argument-list , template-argument ..._opt -template-argument := constant-expression -template-argument := type-id -template-argument := id-expression -constraint-expression := logical-or-expression -deduction-guide := explicit-specifier_opt template-name ( parameter-declaration-list_opt ) -> simple-template-id ; -concept-definition := CONCEPT concept-name = constraint-expression ; -concept-name := IDENTIFIER -typename-specifier := TYPENAME nested-name-specifier IDENTIFIER -typename-specifier := TYPENAME nested-name-specifier TEMPLATE_opt simple-template-id -explicit-instantiation := EXTERN_opt TEMPLATE declaration -explicit-specialization := TEMPLATE < > declaration - -# gram.except -try-block := TRY compound-statement handler-seq -function-try-block := TRY ctor-initializer_opt compound-statement handler-seq -handler-seq := handler handler-seq_opt -handler := CATCH ( exception-declaration ) compound-statement -exception-declaration := type-specifier-seq declarator -exception-declaration := type-specifier-seq abstract-declarator_opt -noexcept-specifier := NOEXCEPT ( constant-expression ) -noexcept-specifier := NOEXCEPT - -# gram.cpp -identifier-list := IDENTIFIER -identifier-list := identifier-list , IDENTIFIER - -# gram.lex -#! As we use clang lexer, most of lexical symbols are not needed, we only add -#! needed literals. -literal := integer-literal -literal := character-literal -literal := floating-point-literal -literal := string-literal -literal := boolean-literal -literal := pointer-literal -literal := user-defined-literal -integer-literal := NUMERIC_CONSTANT [guard] -character-literal := CHAR_CONSTANT [guard] -character-literal := WIDE_CHAR_CONSTANT [guard] -character-literal := UTF8_CHAR_CONSTANT [guard] -character-literal := UTF16_CHAR_CONSTANT [guard] -character-literal := UTF32_CHAR_CONSTANT [guard] -floating-point-literal := NUMERIC_CONSTANT [guard] -string-literal-chunk := STRING_LITERAL [guard] -string-literal-chunk := WIDE_STRING_LITERAL [guard] -string-literal-chunk := UTF8_STRING_LITERAL [guard] -string-literal-chunk := UTF16_STRING_LITERAL [guard] -string-literal-chunk := UTF32_STRING_LITERAL [guard] -#! Technically, string concatenation happens at phase 6 which is before parsing, -#! so it doesn't belong to the grammar. However, we extend the grammar to -#! support it, to make the pseudoparser fully functional on practical code. -string-literal := string-literal-chunk -string-literal := string-literal string-literal-chunk -user-defined-literal := user-defined-integer-literal -user-defined-literal := user-defined-floating-point-literal -user-defined-literal := user-defined-string-literal -user-defined-literal := user-defined-character-literal -user-defined-integer-literal := NUMERIC_CONSTANT [guard] -user-defined-string-literal-chunk := STRING_LITERAL [guard] -user-defined-string-literal-chunk := WIDE_STRING_LITERAL [guard] -user-defined-string-literal-chunk := UTF8_STRING_LITERAL [guard] -user-defined-string-literal-chunk := UTF16_STRING_LITERAL [guard] -user-defined-string-literal-chunk := UTF32_STRING_LITERAL [guard] -user-defined-string-literal := user-defined-string-literal-chunk -user-defined-string-literal := string-literal-chunk user-defined-string-literal -user-defined-string-literal := user-defined-string-literal string-literal-chunk -user-defined-floating-point-literal := NUMERIC_CONSTANT [guard] -user-defined-character-literal := CHAR_CONSTANT [guard] -user-defined-character-literal := WIDE_CHAR_CONSTANT [guard] -user-defined-character-literal := UTF8_CHAR_CONSTANT [guard] -user-defined-character-literal := UTF16_CHAR_CONSTANT [guard] -user-defined-character-literal := UTF32_CHAR_CONSTANT [guard] -boolean-literal := FALSE -boolean-literal := TRUE -pointer-literal := NULLPTR - -#! Contextual keywords -- clang lexer always lexes them as identifier tokens. -#! Placeholders for literal text in the grammar that lex as other things. -contextual-override := IDENTIFIER [guard] -contextual-final := IDENTIFIER [guard] -contextual-zero := NUMERIC_CONSTANT [guard] -module-keyword := IDENTIFIER [guard] -import-keyword := IDENTIFIER [guard] -export-keyword := IDENTIFIER [guard] - -#! greatergreater token -- clang lexer always lexes it as a single token, we -#! split it into two tokens to make the GLR parser aware of the nested-template -#! case. -greatergreater := > > - -#! C++ predefined identifier, __func__ [dcl.fct.def.general] p8 -#! FIXME: add other (MSVC, GNU extension) predefined identifiers. -primary-expression := predefined-expression -predefined-expression := __FUNC__ diff --git a/clang-tools-extra/pseudo/lib/grammar/CMakeLists.txt b/clang-tools-extra/pseudo/lib/grammar/CMakeLists.txt deleted file mode 100644 index bb08ebab0fa62..0000000000000 --- a/clang-tools-extra/pseudo/lib/grammar/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -set(LLVM_LINK_COMPONENTS Support) - -add_clang_library(clangPseudoGrammar - Grammar.cpp - GrammarBNF.cpp - LRGraph.cpp - LRTable.cpp - LRTableBuild.cpp - ) - diff --git a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp deleted file mode 100644 index 3e9c5c3c7a6c4..0000000000000 --- a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp +++ /dev/null @@ -1,190 +0,0 @@ -//===--- Grammar.cpp - Grammar for clang pseudoparser -----------*- C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "clang-pseudo/grammar/Grammar.h" -#include "clang/Basic/TokenKinds.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/raw_ostream.h" -#include - -namespace clang { -namespace pseudo { - -Rule::Rule(SymbolID Target, llvm::ArrayRef Sequence) - : Target(Target), Size(static_cast(Sequence.size())) { - assert(Sequence.size() <= Rule::MaxElements); - llvm::copy(Sequence, this->Sequence); -} - -Grammar::Grammar(std::unique_ptr Table) : T(std::move(Table)) { - Underscore = *findNonterminal("_"); -} - -llvm::ArrayRef Grammar::rulesFor(SymbolID SID) const { - assert(isNonterminal(SID)); - const auto &R = T->Nonterminals[SID].RuleRange; - assert(R.End <= T->Rules.size()); - return llvm::ArrayRef(&T->Rules[R.Start], R.End - R.Start); -} - -const Rule &Grammar::lookupRule(RuleID RID) const { - assert(RID < T->Rules.size()); - return T->Rules[RID]; -} - -llvm::StringRef Grammar::symbolName(SymbolID SID) const { - if (isToken(SID)) - return T->Terminals[symbolToToken(SID)]; - return T->Nonterminals[SID].Name; -} - -std::optional Grammar::findNonterminal(llvm::StringRef Name) const { - auto It = llvm::partition_point( - T->Nonterminals, - [&](const GrammarTable::Nonterminal &X) { return X.Name < Name; }); - if (It != T->Nonterminals.end() && It->Name == Name) - return It - T->Nonterminals.begin(); - return std::nullopt; -} - -std::string Grammar::dumpRule(RuleID RID) const { - std::string Result; - llvm::raw_string_ostream OS(Result); - const Rule &R = T->Rules[RID]; - OS << symbolName(R.Target) << " :="; - for (unsigned I = 0; I < R.Size; ++I) { - OS << " " << symbolName(R.Sequence[I]); - if (R.RecoveryIndex == I) - OS << " [recover=" << T->AttributeValues[R.Recovery] << "]"; - } - if (R.Guarded) - OS << " [guard]"; - return Result; -} - -std::string Grammar::dumpRules(SymbolID SID) const { - assert(isNonterminal(SID)); - std::string Result; - const auto &Range = T->Nonterminals[SID].RuleRange; - for (RuleID RID = Range.Start; RID < Range.End; ++RID) - Result.append(dumpRule(RID)).push_back('\n'); - return Result; -} - -std::string Grammar::dump() const { - std::string Result; - llvm::raw_string_ostream OS(Result); - OS << "Nonterminals:\n"; - for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) - OS << llvm::formatv(" {0} {1}\n", SID, symbolName(SID)); - OS << "Rules:\n"; - for (RuleID RID = 0; RID < T->Rules.size(); ++RID) - OS << llvm::formatv(" {0} {1}\n", RID, dumpRule(RID)); - return OS.str(); -} - -std::vector> firstSets(const Grammar &G) { - std::vector> FirstSets( - G.table().Nonterminals.size()); - auto ExpandFirstSet = [&FirstSets](SymbolID Target, SymbolID First) { - assert(isNonterminal(Target)); - if (isToken(First)) - return FirstSets[Target].insert(First).second; - bool Changed = false; - for (SymbolID SID : FirstSets[First]) - Changed |= FirstSets[Target].insert(SID).second; - return Changed; - }; - - // A rule S := T ... implies elements in FIRST(S): - // - if T is a terminal, FIRST(S) contains T - // - if T is a nonterminal, FIRST(S) contains FIRST(T) - // Since FIRST(T) may not have been fully computed yet, FIRST(S) itself may - // end up being incomplete. - // We iterate until we hit a fixed point. - // (This isn't particularly efficient, but table building isn't on the - // critical path). - bool Changed = true; - while (Changed) { - Changed = false; - for (const auto &R : G.table().Rules) - // We only need to consider the first element because symbols are - // non-nullable. - Changed |= ExpandFirstSet(R.Target, R.seq().front()); - } - return FirstSets; -} - -std::vector> followSets(const Grammar &G) { - auto FirstSets = firstSets(G); - std::vector> FollowSets( - G.table().Nonterminals.size()); - // Expand the follow set of a nonterminal symbol Y by adding all from the - // given symbol set. - auto ExpandFollowSet = [&FollowSets](SymbolID Y, - const llvm::DenseSet &ToAdd) { - assert(isNonterminal(Y)); - bool Changed = false; - for (SymbolID F : ToAdd) - Changed |= FollowSets[Y].insert(F).second; - return Changed; - }; - // Follow sets is computed based on the following 3 rules, the computation - // is completed at a fixed point where there is no more new symbols can be - // added to any of the follow sets. - // - // Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol of the - // augmented grammar, in our case it is '_'. - FollowSets[G.underscore()].insert(tokenSymbol(tok::eof)); - bool Changed = true; - while (Changed) { - Changed = false; - for (const auto &R : G.table().Rules) { - // Rule 2: for a rule X := ... Y Z, we add all symbols from FIRST(Z) to - // FOLLOW(Y). - for (size_t I = 0; I + 1 < R.seq().size(); ++I) { - if (isToken(R.seq()[I])) - continue; - // We only need to consider the next symbol because symbols are - // non-nullable. - SymbolID Next = R.seq()[I + 1]; - if (isToken(Next)) - // First set for a terminal is itself. - Changed |= ExpandFollowSet(R.seq()[I], {Next}); - else - Changed |= ExpandFollowSet(R.seq()[I], FirstSets[Next]); - } - // Rule 3: for a rule X := ... Z, we add all symbols from FOLLOW(X) to - // FOLLOW(Z). - SymbolID Z = R.seq().back(); - if (isNonterminal(Z)) - Changed |= ExpandFollowSet(Z, FollowSets[R.Target]); - } - } - return FollowSets; -} - -static llvm::ArrayRef getTerminalNames() { - static const auto &TerminalNames = []() { - auto TerminalNames = new std::string[NumTerminals]; -#define PUNCTUATOR(Tok, Spelling) TerminalNames[tok::Tok] = Spelling; -#define KEYWORD(Keyword, Condition) \ - TerminalNames[tok::kw_##Keyword] = llvm::StringRef(#Keyword).upper(); -#define TOK(Tok) TerminalNames[tok::Tok] = llvm::StringRef(#Tok).upper(); -#include "clang/Basic/TokenKinds.def" - return llvm::ArrayRef(TerminalNames, NumTerminals); - }(); - return TerminalNames; -} -GrammarTable::GrammarTable() : Terminals(getTerminalNames()) {} - -} // namespace pseudo -} // namespace clang diff --git a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp b/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp deleted file mode 100644 index f1b8e06e22432..0000000000000 --- a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp +++ /dev/null @@ -1,362 +0,0 @@ -//===--- GrammarBNF.cpp - build grammar from BNF files ----------*- C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "clang-pseudo/grammar/Grammar.h" -#include "clang/Basic/TokenKinds.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Support/FormatVariadic.h" -#include -#include - -namespace clang { -namespace pseudo { - -namespace { -static const llvm::StringRef OptSuffix = "_opt"; -static const llvm::StringRef StartSymbol = "_"; - -// Builds grammar from BNF files. -class GrammarBuilder { -public: - GrammarBuilder(std::vector &Diagnostics) - : Diagnostics(Diagnostics) {} - - Grammar build(llvm::StringRef BNF) { - auto Specs = eliminateOptional(parse(BNF)); - - assert(llvm::all_of(Specs, - [](const RuleSpec &R) { - if (R.Target.ends_with(OptSuffix)) - return false; - return llvm::all_of( - R.Sequence, [](const RuleSpec::Element &E) { - return !E.Symbol.ends_with(OptSuffix); - }); - }) && - "Optional symbols should be eliminated!"); - - auto T = std::make_unique(); - - // Assemble the name->ID and ID->nonterminal name maps. - llvm::DenseSet UniqueNonterminals; - llvm::DenseMap SymbolIds; - - llvm::DenseSet UniqueAttributeValues; - - for (uint16_t I = 0; I < NumTerminals; ++I) - SymbolIds.try_emplace(T->Terminals[I], tokenSymbol(tok::TokenKind(I))); - auto Consider = [&](llvm::StringRef Name) { - if (!SymbolIds.count(Name)) - UniqueNonterminals.insert(Name); - }; - for (const auto &Spec : Specs) { - Consider(Spec.Target); - for (const RuleSpec::Element &Elt : Spec.Sequence) { - Consider(Elt.Symbol); - for (const auto& KV : Elt.Attributes) - UniqueAttributeValues.insert(KV.second); - } - } - for (llvm::StringRef Name : UniqueNonterminals) { - T->Nonterminals.emplace_back(); - T->Nonterminals.back().Name = Name.str(); - } - assert(T->Nonterminals.size() < (1 << (SymbolBits - 1)) && - "Too many nonterminals to fit in SymbolID bits!"); - llvm::sort(T->Nonterminals, [](const GrammarTable::Nonterminal &L, - const GrammarTable::Nonterminal &R) { - return L.Name < R.Name; - }); - // Add an empty string for the corresponding sentinel unset attribute. - T->AttributeValues.push_back(""); - UniqueAttributeValues.erase(""); - for (llvm::StringRef Name : UniqueAttributeValues) { - T->AttributeValues.emplace_back(); - T->AttributeValues.back() = Name.str(); - } - llvm::sort(T->AttributeValues); - assert(T->AttributeValues.front() == ""); - - // Build name -> ID maps for nonterminals. - for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) - SymbolIds.try_emplace(T->Nonterminals[SID].Name, SID); - - // Convert the rules. - T->Rules.reserve(Specs.size()); - std::vector Symbols; - auto Lookup = [SymbolIds](llvm::StringRef Name) { - auto It = SymbolIds.find(Name); - assert(It != SymbolIds.end() && "Didn't find the symbol in SymbolIds!"); - return It->second; - }; - for (const auto &Spec : Specs) { - assert(Spec.Sequence.size() <= Rule::MaxElements); - Symbols.clear(); - for (const RuleSpec::Element &Elt : Spec.Sequence) - Symbols.push_back(Lookup(Elt.Symbol)); - T->Rules.push_back(Rule(Lookup(Spec.Target), Symbols)); - applyAttributes(Spec, *T, T->Rules.back()); - } - - assert(T->Rules.size() < (1 << RuleBits) && - "Too many rules to fit in RuleID bits!"); - const auto &SymbolOrder = getTopologicalOrder(T.get()); - llvm::stable_sort( - T->Rules, [&SymbolOrder](const Rule &Left, const Rule &Right) { - // Sorted by the topological order of the nonterminal Target. - return SymbolOrder[Left.Target] < SymbolOrder[Right.Target]; - }); - for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) { - auto StartIt = llvm::partition_point(T->Rules, [&](const Rule &R) { - return SymbolOrder[R.Target] < SymbolOrder[SID]; - }); - RuleID Start = StartIt - T->Rules.begin(); - RuleID End = Start; - while (End < T->Rules.size() && T->Rules[End].Target == SID) - ++End; - T->Nonterminals[SID].RuleRange = {Start, End}; - } - Grammar G(std::move(T)); - diagnoseGrammar(G); - return G; - } - - // Gets topological order for nonterminal symbols. - // - // The topological order is defined as: if a *single* nonterminal A produces - // (or transitively) a nonterminal B (that said, there is a production rule - // B := A), then A is less than B. - // - // It returns the sort key for each symbol, the array is indexed by SymbolID. - std::vector getTopologicalOrder(GrammarTable *T) { - std::vector> Dependencies; - for (const auto &Rule : T->Rules) { - // if A := B, A depends on B. - if (Rule.Size == 1 && pseudo::isNonterminal(Rule.Sequence[0])) - Dependencies.push_back({Rule.Target, Rule.Sequence[0]}); - } - llvm::sort(Dependencies); - std::vector Order; - // Each nonterminal state flows: NotVisited -> Visiting -> Visited. - enum State { - NotVisited, - Visiting, - Visited, - }; - std::vector VisitStates(T->Nonterminals.size(), NotVisited); - std::function DFS = [&](SymbolID SID) -> void { - if (VisitStates[SID] == Visited) - return; - if (VisitStates[SID] == Visiting) { - Diagnostics.push_back( - llvm::formatv("The grammar contains a cycle involving symbol {0}", - T->Nonterminals[SID].Name)); - return; - } - VisitStates[SID] = Visiting; - for (auto It = llvm::lower_bound(Dependencies, - std::pair{SID, 0}); - It != Dependencies.end() && It->first == SID; ++It) - DFS(It->second); - VisitStates[SID] = Visited; - Order.push_back(SID); - }; - for (SymbolID ID = 0; ID != T->Nonterminals.size(); ++ID) - DFS(ID); - std::vector Result(T->Nonterminals.size(), 0); - for (size_t I = 0; I < Order.size(); ++I) - Result[Order[I]] = I; - return Result; - } - -private: - // Text representation of a BNF grammar rule. - struct RuleSpec { - llvm::StringRef Target; - struct Element { - llvm::StringRef Symbol; // Name of the symbol - // Attributes that are associated to the sequence symbol or rule. - std::vector> - Attributes; - }; - std::vector Sequence; - - std::string toString() const { - std::vector Body; - for (const auto &E : Sequence) - Body.push_back(E.Symbol); - return llvm::formatv("{0} := {1}", Target, llvm::join(Body, " ")); - } - }; - - std::vector parse(llvm::StringRef Lines) { - std::vector Specs; - for (llvm::StringRef Line : llvm::split(Lines, '\n')) { - Line = Line.trim(); - // Strip anything coming after the '#' (comment). - Line = Line.take_while([](char C) { return C != '#'; }); - if (Line.empty()) - continue; - RuleSpec Rule; - if (parseLine(Line, Rule)) - Specs.push_back(std::move(Rule)); - } - return Specs; - } - - bool parseLine(llvm::StringRef Line, RuleSpec &Out) { - auto Parts = Line.split(":="); - if (Parts.first == Line) { // no separator in Line - Diagnostics.push_back( - llvm::formatv("Failed to parse '{0}': no separator :=", Line).str()); - return false; - } - - Out.Target = Parts.first.trim(); - Out.Sequence.clear(); - for (llvm::StringRef Chunk : llvm::split(Parts.second, ' ')) { - Chunk = Chunk.trim(); - if (Chunk.empty()) - continue; // skip empty - if (Chunk.starts_with("[") && Chunk.ends_with("]")) { - if (Out.Sequence.empty()) - continue; - - parseAttributes(Chunk, Out.Sequence.back().Attributes); - continue; - } - - Out.Sequence.push_back({Chunk, /*Attributes=*/{}}); - } - return true; - } - - bool parseAttributes( - llvm::StringRef Content, - std::vector> &Out) { - assert(Content.starts_with("[") && Content.ends_with("]")); - auto KV = Content.drop_front().drop_back().split('='); - Out.push_back({KV.first, KV.second.trim()}); - - return true; - } - // Apply the parsed extensions (stored in RuleSpec) to the grammar Rule. - void applyAttributes(const RuleSpec& Spec, const GrammarTable& T, Rule& R) { - auto LookupExtensionID = [&T](llvm::StringRef Name) { - const auto It = llvm::partition_point( - T.AttributeValues, [&](llvm::StringRef X) { return X < Name; }); - assert(It != T.AttributeValues.end() && *It == Name && - "Didn't find the attribute in AttrValues!"); - return It - T.AttributeValues.begin(); - }; - for (unsigned I = 0; I < Spec.Sequence.size(); ++I) { - for (const auto &KV : Spec.Sequence[I].Attributes) { - if (KV.first == "guard") { - R.Guarded = true; - } else if (KV.first == "recover") { - R.Recovery = LookupExtensionID(KV.second); - R.RecoveryIndex = I; - } else { - Diagnostics.push_back( - llvm::formatv("Unknown attribute '{0}'", KV.first).str()); - } - } - } - } - - // Inlines all _opt symbols. - // For example, a rule E := id +_opt id, after elimination, we have two - // equivalent rules: - // 1) E := id + id - // 2) E := id id - std::vector eliminateOptional(llvm::ArrayRef Input) { - std::vector Results; - std::vector Storage; - for (const auto &R : Input) { - eliminateOptionalTail( - R.Sequence, Storage, [&Results, &Storage, &R, this]() { - if (Storage.empty()) { - Diagnostics.push_back( - llvm::formatv("Rule '{0}' has a nullable RHS", R.toString())); - return; - } - Results.push_back({R.Target, Storage}); - }); - assert(Storage.empty()); - } - return Results; - } - void eliminateOptionalTail(llvm::ArrayRef Elements, - std::vector &Result, - llvm::function_ref CB) { - if (Elements.empty()) - return CB(); - auto Front = Elements.front(); - if (!Front.Symbol.ends_with(OptSuffix)) { - Result.push_back(std::move(Front)); - eliminateOptionalTail(Elements.drop_front(1), Result, CB); - Result.pop_back(); - return; - } - // Enumerate two options: skip the opt symbol, or inline the symbol. - eliminateOptionalTail(Elements.drop_front(1), Result, CB); // skip - Front.Symbol = Front.Symbol.drop_back(OptSuffix.size()); // drop "_opt" - Result.push_back(std::move(Front)); - eliminateOptionalTail(Elements.drop_front(1), Result, CB); - Result.pop_back(); - } - - // Diagnoses the grammar and emit warnings if any. - void diagnoseGrammar(const Grammar &G) { - const auto &T = G.table(); - for (SymbolID SID = 0; SID < T.Nonterminals.size(); ++SID) { - auto Range = T.Nonterminals[SID].RuleRange; - if (Range.Start == Range.End) - Diagnostics.push_back( - llvm::formatv("No rules for nonterminal: {0}", G.symbolName(SID))); - llvm::StringRef NameRef = T.Nonterminals[SID].Name; - if (llvm::all_of(NameRef, llvm::isAlpha) && NameRef.upper() == NameRef) { - Diagnostics.push_back(llvm::formatv( - "Token-like name {0} is used as a nonterminal", G.symbolName(SID))); - } - } - llvm::DenseSet VisitedRules; - for (RuleID RID = 0; RID < T.Rules.size(); ++RID) { - const auto &R = T.Rules[RID]; - auto Code = llvm::hash_combine( - R.Target, llvm::hash_combine_range(R.seq().begin(), R.seq().end())); - auto [_, New] = VisitedRules.insert(Code); - if (!New) - Diagnostics.push_back( - llvm::formatv("Duplicate rule: `{0}`", G.dumpRule(RID))); - } - // symbol-id -> used counts - std::vector UseCounts(T.Nonterminals.size(), 0); - for (const Rule &R : T.Rules) - for (SymbolID SID : R.seq()) - if (isNonterminal(SID)) - ++UseCounts[SID]; - for (SymbolID SID = 0; SID < UseCounts.size(); ++SID) - if (UseCounts[SID] == 0 && T.Nonterminals[SID].Name != StartSymbol) - Diagnostics.push_back( - llvm::formatv("Nonterminal never used: {0}", G.symbolName(SID))); - } - std::vector &Diagnostics; -}; -} // namespace - -Grammar Grammar::parseBNF(llvm::StringRef BNF, - std::vector &Diagnostics) { - Diagnostics.clear(); - return GrammarBuilder(Diagnostics).build(BNF); -} - -} // namespace pseudo -} // namespace clang diff --git a/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp b/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp deleted file mode 100644 index 82c7cc7d8b293..0000000000000 --- a/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp +++ /dev/null @@ -1,265 +0,0 @@ -//===--- LRGraph.cpp - -------------------------------------------*- C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "clang-pseudo/grammar/LRGraph.h" -#include "clang-pseudo/grammar/Grammar.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/Hashing.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/raw_ostream.h" - -using ItemSet = std::vector; - -namespace llvm { -// Support clang::pseudo::Item as DenseMap keys. -template <> struct DenseMapInfo { - static inline ItemSet getEmptyKey() { - return {DenseMapInfo::getEmptyKey()}; - } - static inline ItemSet getTombstoneKey() { - return {DenseMapInfo::getTombstoneKey()}; - } - static unsigned getHashValue(const ItemSet &I) { - return llvm::hash_combine_range(I.begin(), I.end()); - } - static bool isEqual(const ItemSet &LHS, const ItemSet &RHS) { - return LHS == RHS; - } -}; -} // namespace llvm - -namespace clang { -namespace pseudo { -namespace { - -struct SortByNextSymbol { - SortByNextSymbol(const Grammar &G) : G(G) {} - bool operator()(const Item &L, const Item &R) { - if (L.hasNext() && R.hasNext() && L.next(G) != R.next(G)) - return L.next(G) < R.next(G); - if (L.hasNext() != R.hasNext()) - return L.hasNext() < R.hasNext(); // a trailing dot is minimal. - return L < R; - } - const Grammar &G; -}; - -// Computes a closure of the given item set S: -// - extends the given S to contain all options for parsing next token; -// - nonterminals after a dot are recursively expanded into the begin-state -// of all production rules that produce that nonterminal; -// -// Given -// Grammar rules = [ _ := E, E := E - T, E := T, T := n, T := ( E ) ] -// Input = [ E := . T ] -// returns [ E := . T, T := . n, T := . ( E ) ] -State closure(ItemSet Queue, const Grammar &G) { - llvm::DenseSet InQueue = {Queue.begin(), Queue.end()}; - // We reuse the passed-by-value Queue as the final result, as it's already - // initialized to the right elements. - size_t ItIndex = 0; - while (ItIndex < Queue.size()) { - const Item &ExpandingItem = Queue[ItIndex]; - ++ItIndex; - if (!ExpandingItem.hasNext()) - continue; - - SymbolID NextSym = ExpandingItem.next(G); - if (pseudo::isToken(NextSym)) - continue; - auto RRange = G.table().Nonterminals[NextSym].RuleRange; - for (RuleID RID = RRange.Start; RID < RRange.End; ++RID) { - Item NewItem = Item::start(RID, G); - if (InQueue.insert(NewItem).second) // new - Queue.push_back(std::move(NewItem)); - } - } - Queue.shrink_to_fit(); - llvm::sort(Queue, SortByNextSymbol(G)); - return {std::move(Queue)}; -} - -// Returns all next (with a dot advanced) kernel item sets, partitioned by the -// advanced symbol. -// -// Given -// S = [ E := . a b, E := E . - T ] -// returns [ -// {id(a), [ E := a . b ]}, -// {id(-), [ E := E - . T ]} -// ] -std::vector> -nextAvailableKernelItems(const State &S, const Grammar &G) { - std::vector> Results; - llvm::ArrayRef AllItems = S.Items; - AllItems = AllItems.drop_while([](const Item &I) { return !I.hasNext(); }); - while (!AllItems.empty()) { - SymbolID AdvancedSymbol = AllItems.front().next(G); - auto Batch = AllItems.take_while([AdvancedSymbol, &G](const Item &I) { - assert(I.hasNext()); - return I.next(G) == AdvancedSymbol; - }); - assert(!Batch.empty()); - AllItems = AllItems.drop_front(Batch.size()); - - // Advance a dot over the Symbol. - ItemSet Next; - for (const Item &I : Batch) - Next.push_back(I.advance()); - // sort the set to keep order determinism for hash computation. - llvm::sort(Next); - Results.push_back({AdvancedSymbol, std::move(Next)}); - } - return Results; -} - -std::vector> -availableRecovery(const State &S, const Grammar &G) { - std::vector> Result; - for (const Item &I : S.Items) { - const auto &Rule = G.lookupRule(I.rule()); - if (I.dot() != Rule.RecoveryIndex) - continue; - Result.push_back({Rule.Recovery, Rule.seq()[Rule.RecoveryIndex]}); - } - llvm::sort(Result); - Result.erase(std::unique(Result.begin(), Result.end()), Result.end()); - return Result; -} - -} // namespace - -std::string Item::dump(const Grammar &G) const { - const auto &Rule = G.lookupRule(RID); - auto ToNames = [&](llvm::ArrayRef Syms) { - std::vector Results; - for (auto SID : Syms) - Results.push_back(G.symbolName(SID)); - return Results; - }; - return llvm::formatv("{0} := {1} • {2}{3}", G.symbolName(Rule.Target), - llvm::join(ToNames(Rule.seq().take_front(DotPos)), " "), - llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "), - Rule.RecoveryIndex == DotPos ? " [recovery]" : "") - .str(); -} - -std::string State::dump(const Grammar &G, unsigned Indent) const { - std::string Result; - llvm::raw_string_ostream OS(Result); - for (const auto &Item : Items) - OS.indent(Indent) << llvm::formatv("{0}\n", Item.dump(G)); - return OS.str(); -} - -std::string LRGraph::dumpForTests(const Grammar &G) const { - std::string Result; - llvm::raw_string_ostream OS(Result); - OS << "States:\n"; - for (StateID ID = 0; ID < States.size(); ++ID) { - OS << llvm::formatv("State {0}\n", ID); - OS << States[ID].dump(G, /*Indent*/ 4); - } - for (const auto &E : Edges) { - OS << llvm::formatv("{0} ->[{1}] {2}\n", E.Src, G.symbolName(E.Label), - E.Dst); - } - return OS.str(); -} - -LRGraph LRGraph::buildLR0(const Grammar &G) { - class Builder { - public: - Builder(const Grammar &G) : G(G) {} - - // Adds a given state if not existed. - std::pair insert(ItemSet KernelItems) { - assert(llvm::is_sorted(KernelItems) && - "Item must be sorted before inserting to a hash map!"); - auto It = StatesIndex.find(KernelItems); - if (It != StatesIndex.end()) - return {It->second, false}; - States.push_back(closure(KernelItems, G)); - StateID NextStateID = States.size() - 1; - StatesIndex.insert({std::move(KernelItems), NextStateID}); - return {NextStateID, true}; - } - - void insertEdge(StateID Src, StateID Dst, SymbolID Label) { - Edges.push_back({Src, Dst, Label}); - } - - void insertRecovery(StateID Src, ExtensionID Strategy, SymbolID Result) { - Recoveries.push_back({Src, Strategy, Result}); - } - - // Returns a state with the given id. - const State &find(StateID ID) const { - assert(ID < States.size()); - return States[ID]; - } - - void addStartState(SymbolID Sym, StateID State) { - StartStates.push_back({Sym, State}); - } - - LRGraph build() && { - States.shrink_to_fit(); - Edges.shrink_to_fit(); - Recoveries.shrink_to_fit(); - llvm::sort(StartStates); - StartStates.shrink_to_fit(); - return LRGraph(std::move(States), std::move(Edges), std::move(Recoveries), - std::move(StartStates)); - } - - private: - // Key is the **kernel** item sets. - llvm::DenseMap StatesIndex; - std::vector States; - std::vector Edges; - std::vector Recoveries; - const Grammar &G; - std::vector> StartStates; - } Builder(G); - - std::vector PendingStates; - // Initialize states with the start symbol. - auto RRange = G.table().Nonterminals[G.underscore()].RuleRange; - for (RuleID RID = RRange.Start; RID < RRange.End; ++RID) { - auto StartState = std::vector{Item::start(RID, G)}; - auto Result = Builder.insert(std::move(StartState)); - assert(Result.second && "State must be new"); - PendingStates.push_back(Result.first); - - const Rule &StartRule = G.lookupRule(RID); - assert(StartRule.Size == 2 && - StartRule.seq().back() == tokenSymbol(tok::eof) && - "Start rule must be of the form `_ := start-symbol EOF`!"); - Builder.addStartState(StartRule.seq().front(), Result.first); - } - - while (!PendingStates.empty()) { - auto StateID = PendingStates.back(); - PendingStates.pop_back(); - for (auto Next : nextAvailableKernelItems(Builder.find(StateID), G)) { - auto Insert = Builder.insert(Next.second); - if (Insert.second) // new state, insert to the pending queue. - PendingStates.push_back(Insert.first); - Builder.insertEdge(StateID, Insert.first, Next.first); - } - for (auto Recovery : availableRecovery(Builder.find(StateID), G)) - Builder.insertRecovery(StateID, Recovery.first, Recovery.second); - } - return std::move(Builder).build(); -} - -} // namespace pseudo -} // namespace clang diff --git a/clang-tools-extra/pseudo/lib/grammar/LRTable.cpp b/clang-tools-extra/pseudo/lib/grammar/LRTable.cpp deleted file mode 100644 index 6a68f1489d57a..0000000000000 --- a/clang-tools-extra/pseudo/lib/grammar/LRTable.cpp +++ /dev/null @@ -1,79 +0,0 @@ -//===--- LRTable.cpp - Parsing table for LR parsers --------------*- C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "clang-pseudo/grammar/LRTable.h" -#include "clang-pseudo/grammar/Grammar.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/raw_ostream.h" - -namespace clang { -namespace pseudo { - -std::string LRTable::dumpStatistics() const { - return llvm::formatv(R"( -Statistics of the LR parsing table: - number of states: {0} - number of actions: shift={1} goto={2} reduce={3} - size of the table (bytes): {4} -)", - numStates(), Shifts.size(), Gotos.size(), Reduces.size(), - bytes()) - .str(); -} - -std::string LRTable::dumpForTests(const Grammar &G) const { - std::string Result; - llvm::raw_string_ostream OS(Result); - OS << "LRTable:\n"; - for (StateID S = 0; S < numStates(); ++S) { - OS << llvm::formatv("State {0}\n", S); - for (uint16_t Terminal = 0; Terminal < NumTerminals; ++Terminal) { - SymbolID TokID = tokenSymbol(static_cast(Terminal)); - if (auto SS = getShiftState(S, TokID)) - OS.indent(4) << llvm::formatv("{0}: shift state {1}\n", - G.symbolName(TokID), SS); - } - for (RuleID R : getReduceRules(S)) { - SymbolID Target = G.lookupRule(R).Target; - std::vector Terminals; - for (unsigned Terminal = 0; Terminal < NumTerminals; ++Terminal) { - SymbolID TokID = tokenSymbol(static_cast(Terminal)); - if (canFollow(Target, TokID)) - Terminals.push_back(G.symbolName(TokID)); - } - OS.indent(4) << llvm::formatv("{0}: reduce by rule {1} '{2}'\n", - llvm::join(Terminals, " "), R, - G.dumpRule(R)); - } - for (SymbolID NontermID = 0; NontermID < G.table().Nonterminals.size(); - ++NontermID) { - if (auto GS = getGoToState(S, NontermID)) { - OS.indent(4) << llvm::formatv("{0}: go to state {1}\n", - G.symbolName(NontermID), *GS); - } - } - } - return OS.str(); -} - -LRTable::StateID LRTable::getStartState(SymbolID Target) const { - assert(llvm::is_sorted(StartStates) && "StartStates must be sorted!"); - auto It = llvm::partition_point( - StartStates, [Target](const std::pair &X) { - return X.first < Target; - }); - assert(It != StartStates.end() && It->first == Target && - "target symbol doesn't have a start state!"); - return It->second; -} - -} // namespace pseudo -} // namespace clang diff --git a/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp b/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp deleted file mode 100644 index 387e1c54ee99b..0000000000000 --- a/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp +++ /dev/null @@ -1,121 +0,0 @@ -//===--- LRTableBuild.cpp - Build a LRTable from LRGraph ---------*- C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "clang-pseudo/grammar/Grammar.h" -#include "clang-pseudo/grammar/LRGraph.h" -#include "clang-pseudo/grammar/LRTable.h" -#include "clang/Basic/TokenKinds.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallSet.h" -#include - -namespace clang { -namespace pseudo { - -LRTable LRTable::Builder::build() && { - assert(NumNonterminals != 0 && "Set NumNonterminals or init with grammar"); - LRTable Table; - - // Count number of states: every state has to be reachable somehow. - StateID MaxState = 0; - for (const auto &Entry : StartStates) - MaxState = std::max(MaxState, Entry.second); - for (const auto &Entry : Transition) - MaxState = std::max(MaxState, Entry.second); - unsigned NumStates = MaxState + 1; - - Table.StartStates = std::move(StartStates); - - // Compile the goto and shift actions into transition tables. - llvm::DenseMap Gotos; - llvm::DenseMap Shifts; - for (const auto &E : Transition) { - if (isToken(E.first.second)) - Shifts.try_emplace(shiftIndex(E.first.first, E.first.second, NumStates), - E.second); - else - Gotos.try_emplace(gotoIndex(E.first.first, E.first.second, NumStates), - E.second); - } - Table.Shifts = TransitionTable(Shifts, NumStates * NumTerminals); - Table.Gotos = TransitionTable(Gotos, NumStates * NumNonterminals); - - // Compile the follow sets into a bitmap. - Table.FollowSets.resize(tok::NUM_TOKENS * FollowSets.size()); - for (SymbolID NT = 0; NT < FollowSets.size(); ++NT) - for (SymbolID Follow : FollowSets[NT]) - Table.FollowSets.set(NT * tok::NUM_TOKENS + symbolToToken(Follow)); - - // Store the reduce actions in a vector partitioned by state. - Table.ReduceOffset.reserve(NumStates + 1); - std::vector StateRules; - for (StateID S = 0; S < NumStates; ++S) { - Table.ReduceOffset.push_back(Table.Reduces.size()); - auto It = Reduce.find(S); - if (It == Reduce.end()) - continue; - Table.Reduces.insert(Table.Reduces.end(), It->second.begin(), - It->second.end()); - llvm::sort(Table.Reduces.begin() + Table.ReduceOffset.back(), - Table.Reduces.end()); - } - Table.ReduceOffset.push_back(Table.Reduces.size()); - - // Error recovery entries: sort (no dups already), and build offset lookup. - llvm::sort(Recoveries, [&](const auto &L, const auto &R) { - return std::tie(L.first, L.second.Result, L.second.Strategy) < - std::tie(R.first, R.second.Result, R.second.Strategy); - }); - Table.Recoveries.reserve(Recoveries.size()); - for (const auto &R : Recoveries) - Table.Recoveries.push_back({R.second.Strategy, R.second.Result}); - Table.RecoveryOffset = std::vector(NumStates + 1, 0); - unsigned SortedIndex = 0; - for (StateID State = 0; State < NumStates; ++State) { - Table.RecoveryOffset[State] = SortedIndex; - while (SortedIndex < Recoveries.size() && - Recoveries[SortedIndex].first == State) - SortedIndex++; - } - Table.RecoveryOffset[NumStates] = SortedIndex; - assert(SortedIndex == Recoveries.size()); - - return Table; -} - -LRTable LRTable::buildSLR(const Grammar &G) { - auto Graph = LRGraph::buildLR0(G); - Builder Build(G); - Build.StartStates = Graph.startStates(); - for (const auto &T : Graph.edges()) - Build.Transition.try_emplace({T.Src, T.Label}, T.Dst); - for (const auto &Entry : Graph.recoveries()) - Build.Recoveries.push_back( - {Entry.Src, Recovery{Entry.Strategy, Entry.Result}}); - Build.FollowSets = followSets(G); - assert(Graph.states().size() <= (1 << StateBits) && - "Graph states execceds the maximum limit!"); - // Add reduce actions. - for (StateID SID = 0; SID < Graph.states().size(); ++SID) { - for (const Item &I : Graph.states()[SID].Items) { - // If we've just parsed the start symbol, this means we successfully parse - // the input. We don't add the reduce action of `_ := start_symbol` in the - // LRTable (the GLR parser handles it specifically). - if (G.lookupRule(I.rule()).Target == G.underscore() && !I.hasNext()) - continue; - if (!I.hasNext()) - // If we've reached the end of a rule A := ..., then we can reduce if - // the next token is in the follow set of A. - Build.Reduce[SID].insert(I.rule()); - } - } - return std::move(Build).build(); -} - -} // namespace pseudo -} // namespace clang diff --git a/clang-tools-extra/pseudo/test/CMakeLists.txt b/clang-tools-extra/pseudo/test/CMakeLists.txt index 712527f78140e..56694c4a9f5a6 100644 --- a/clang-tools-extra/pseudo/test/CMakeLists.txt +++ b/clang-tools-extra/pseudo/test/CMakeLists.txt @@ -1,6 +1,4 @@ set(CLANG_PSEUDO_TEST_DEPS - clang-pseudo - clang-pseudo-fuzzer ClangPseudoTests ) diff --git a/clang-tools-extra/pseudo/test/check-cxx-bnf.test b/clang-tools-extra/pseudo/test/check-cxx-bnf.test deleted file mode 100644 index b825ff32faa1c..0000000000000 --- a/clang-tools-extra/pseudo/test/check-cxx-bnf.test +++ /dev/null @@ -1,2 +0,0 @@ -// verify clang/lib/Tooling/Syntax/Pseudo/cxx/cxx.bnf -// RUN: clang-pseudo -grammar=%cxx-bnf-file diff --git a/clang-tools-extra/pseudo/test/crash/backslashes.c b/clang-tools-extra/pseudo/test/crash/backslashes.c deleted file mode 100644 index 4ca70c609a0e6..0000000000000 --- a/clang-tools-extra/pseudo/test/crash/backslashes.c +++ /dev/null @@ -1,4 +0,0 @@ -// We used to try to interpret these backslashes as UCNs. -// RUN: clang-pseudo -source=%s -print-tokens -\ -\ x diff --git a/clang-tools-extra/pseudo/test/cxx/capture-list.cpp b/clang-tools-extra/pseudo/test/cxx/capture-list.cpp deleted file mode 100644 index fde46e4f0e038..0000000000000 --- a/clang-tools-extra/pseudo/test/cxx/capture-list.cpp +++ /dev/null @@ -1,23 +0,0 @@ -// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s -// We loosely allow capture defaults in any position/multiple times. -auto lambda = [&, &foo, bar(x), =]{}; -// CHECK: lambda-introducer := [ capture-list ] -// CHECK-NEXT: ├─[ -// CHECK-NEXT: ├─capture-list -// CHECK-NEXT: │ ├─capture-list -// CHECK-NEXT: │ │ ├─capture-list -// CHECK-NEXT: │ │ │ ├─capture-list~& := tok[4] -// CHECK-NEXT: │ │ │ ├─, -// CHECK-NEXT: │ │ │ └─capture~simple-capture -// CHECK-NEXT: │ │ │ ├─& -// CHECK-NEXT: │ │ │ └─IDENTIFIER := tok[7] -// CHECK-NEXT: │ │ ├─, -// CHECK-NEXT: │ │ └─capture~init-capture -// CHECK-NEXT: │ │ ├─IDENTIFIER := tok[9] -// CHECK-NEXT: │ │ └─initializer := ( expression-list ) -// CHECK-NEXT: │ │ ├─( -// CHECK-NEXT: │ │ ├─expression-list~IDENTIFIER := tok[11] -// CHECK-NEXT: │ │ └─) -// CHECK-NEXT: │ ├─, -// CHECK-NEXT: │ └─capture~= -// CHECK-NEXT: └─] diff --git a/clang-tools-extra/pseudo/test/cxx/contextual-keywords.cpp b/clang-tools-extra/pseudo/test/cxx/contextual-keywords.cpp deleted file mode 100644 index ae74353c0a156..0000000000000 --- a/clang-tools-extra/pseudo/test/cxx/contextual-keywords.cpp +++ /dev/null @@ -1,9 +0,0 @@ -// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s -// Verify that the contextual-{final,override} rules are guarded conditionally, -// No ambiguous parsing for the virt-specifier. -class Foo { - void foo1() override; -// CHECK: virt-specifier-seq~IDENTIFIER := tok[7] - void foo2() final; -// CHECK: virt-specifier-seq~IDENTIFIER := tok[13] -}; diff --git a/clang-tools-extra/pseudo/test/cxx/dangling-else.cpp b/clang-tools-extra/pseudo/test/cxx/dangling-else.cpp deleted file mode 100644 index 151f3931b53f9..0000000000000 --- a/clang-tools-extra/pseudo/test/cxx/dangling-else.cpp +++ /dev/null @@ -1,22 +0,0 @@ -// RUN: clang-pseudo -grammar=cxx -source=%s --start-symbol=statement-seq --print-forest | FileCheck %s - -// Verify the else should belong to the nested if statement -if (true) if (true) {} else {} - -// CHECK: statement-seq~selection-statement := IF ( condition ) statement -// CHECK-NEXT: ├─IF -// CHECK-NEXT: ├─( -// CHECK-NEXT: ├─condition~TRUE -// CHECK-NEXT: ├─) -// CHECK-NEXT: └─statement~selection-statement -// CHECK-NEXT: ├─IF -// CHECK-NEXT: ├─( -// CHECK-NEXT: ├─condition~TRUE -// CHECK-NEXT: ├─) -// CHECK-NEXT: ├─statement~compound-statement := { } -// CHECK-NEXT: │ ├─{ -// CHECK-NEXT: │ └─} -// CHECK-NEXT: ├─ELSE -// CHECK-NEXT: └─statement~compound-statement := { } -// CHECK-NEXT: ├─{ -// CHECK-NEXT: └─} diff --git a/clang-tools-extra/pseudo/test/cxx/decl-specfier-seq.cpp b/clang-tools-extra/pseudo/test/cxx/decl-specfier-seq.cpp deleted file mode 100644 index 255e8bedac497..0000000000000 --- a/clang-tools-extra/pseudo/test/cxx/decl-specfier-seq.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s - -// not parsed as Type{foo} Type{bar} -foo bar; -// CHECK-NOT: simple-declaration := decl-specifier-seq ; -// CHECK: simple-declaration := decl-specifier-seq init-declarator-list ; -// CHECK: ├─decl-specifier-seq~simple-type-specifier -// CHECK: ├─init-declarator-list~IDENTIFIER -// CHECK: └─; -// CHECK-NOT: simple-declaration := decl-specifier-seq ; - -// not parsed as Type{std} Type{::string} Declarator{s}; -std::string s; -// CHECK-NOT: nested-name-specifier := :: -// CHECK: simple-declaration := decl-specifier-seq init-declarator-list ; -// CHECK: ├─decl-specifier-seq~simple-type-specifier := -// CHECK: │ ├─simple-type-specifier := nested-name-specifier type-name -// CHECK: │ │ ├─nested-name-specifier := #1 -// CHECK: │ │ │ ├─nested-name-specifier := type-name :: -// CHECK: │ │ │ └─nested-name-specifier := namespace-name :: -// CHECK: │ │ └─type-name -// CHECK: │ └─simple-type-specifier := nested-name-specifier template-name -// CHECK: │ ├─nested-name-specifier =#1 -// CHECK: │ └─template-name~IDENTIFIER -// CHECK: ├─init-declarator-list~IDENTIFIER -// CHECK: └─; -// CHECK-NOT: nested-name-specifier := :: diff --git a/clang-tools-extra/pseudo/test/cxx/declarator-function.cpp b/clang-tools-extra/pseudo/test/cxx/declarator-function.cpp deleted file mode 100644 index 4d7972807c6db..0000000000000 --- a/clang-tools-extra/pseudo/test/cxx/declarator-function.cpp +++ /dev/null @@ -1,9 +0,0 @@ -// The standard grammar allows an init-list with any declarator, including -// a function declarator. This creates an ambiguity where a function-definition -// is misparsed as a simple-declaration. - -// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s -void s(){}; -// CHECK-NOT: simple-declaration -// CHECK: function-definition := decl-specifier-seq function-declarator function-body -// CHECK-NOT: simple-declaration diff --git a/clang-tools-extra/pseudo/test/cxx/declarator-var.cpp b/clang-tools-extra/pseudo/test/cxx/declarator-var.cpp deleted file mode 100644 index 5aedd8037513f..0000000000000 --- a/clang-tools-extra/pseudo/test/cxx/declarator-var.cpp +++ /dev/null @@ -1,9 +0,0 @@ -// The standard grammar allows an function-body to use any declarator, including -// a non-function declarator. This creates an ambiguity where a -// simple-declaration is misparsed as a function-definition. - -// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s -void (*s)(){}; -// CHECK-NOT: function-definition -// CHECK: init-declarator := non-function-declarator initializer -// CHECK-NOT: function-definition diff --git a/clang-tools-extra/pseudo/test/cxx/declator-member-function.cpp b/clang-tools-extra/pseudo/test/cxx/declator-member-function.cpp deleted file mode 100644 index 58d0ff4ccae9a..0000000000000 --- a/clang-tools-extra/pseudo/test/cxx/declator-member-function.cpp +++ /dev/null @@ -1,9 +0,0 @@ -// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s - -// Similiar to declarator-function.cpp, but for member functions. -class Foo { - void foo() {}; -// CHECK-NOT: member-declarator := declarator brace-or-equal-initializer -// CHECK: member-declaration~function-definition := decl-specifier-seq function-declarator function-body -// CHECK-NOT: member-declarator := declarator brace-or-equal-initializer -}; diff --git a/clang-tools-extra/pseudo/test/cxx/empty-member-declaration.cpp b/clang-tools-extra/pseudo/test/cxx/empty-member-declaration.cpp deleted file mode 100644 index 2540dd010fcef..0000000000000 --- a/clang-tools-extra/pseudo/test/cxx/empty-member-declaration.cpp +++ /dev/null @@ -1,7 +0,0 @@ -// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest --forest-abbrev=false | FileCheck %s -class A { - ; -// CHECK-NOT: member-declaration := ; -// CHECK: member-declaration := empty-declaration -// CHECK-NOT: member-declaration := ; -}; diff --git a/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp b/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp deleted file mode 100644 index 4d15835565b7e..0000000000000 --- a/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp +++ /dev/null @@ -1,13 +0,0 @@ -// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s -class Foo { -public: -}; -// CHECK: decl-specifier-seq~class-specifier := class-head { member-specification [recover=Brackets] } -// CHECK-NEXT: ├─class-head := class-key class-head-name -// CHECK-NEXT: │ ├─class-key~CLASS := tok[0] -// CHECK-NEXT: │ └─class-head-name~IDENTIFIER := tok[1] -// CHECK-NEXT: ├─{ := tok[2] -// CHECK-NEXT: ├─member-specification := access-specifier : -// CHECK-NEXT: │ ├─access-specifier~PUBLIC := tok[3] -// CHECK-NEXT: │ └─: := tok[4] -// CHECK-NEXT: └─} := tok[5] diff --git a/clang-tools-extra/pseudo/test/cxx/keyword.cpp b/clang-tools-extra/pseudo/test/cxx/keyword.cpp deleted file mode 100644 index 318db4ccc49b9..0000000000000 --- a/clang-tools-extra/pseudo/test/cxx/keyword.cpp +++ /dev/null @@ -1,12 +0,0 @@ -// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s -bool operator<(); -// CHECK: translation-unit~simple-declaration := decl-specifier-seq init-declarator-list ; -// CHECK-NEXT: ├─decl-specifier-seq~BOOL -// CHECK-NEXT: ├─init-declarator-list~noptr-declarator := noptr-declarator parameters-and-qualifiers -// CHECK-NEXT: │ ├─noptr-declarator~operator-function-id := OPERATOR operator-name -// CHECK-NEXT: │ │ ├─OPERATOR -// CHECK-NEXT: │ │ └─operator-name~< -// CHECK-NEXT: │ └─parameters-and-qualifiers := ( ) -// CHECK-NEXT: │ ├─( -// CHECK-NEXT: │ └─) -// CHECK-NEXT: └─; diff --git a/clang-tools-extra/pseudo/test/cxx/literals.cpp b/clang-tools-extra/pseudo/test/cxx/literals.cpp deleted file mode 100644 index e1cec8985b25f..0000000000000 --- a/clang-tools-extra/pseudo/test/cxx/literals.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest -forest-abbrev=0 | FileCheck %s --implicit-check-not=ambiguous -auto list = { - 0, // CHECK: := integer-literal - 0b1011, // CHECK: := integer-literal - 0777, // CHECK: := integer-literal - 42_u, // CHECK: := user-defined-integer-literal - 0LL, // CHECK: := integer-literal - 0h, // CHECK: := user-defined-integer-literal - 0., // CHECK: := floating-point-literal - .2, // CHECK: := floating-point-literal - 2e1, // CHECK: := floating-point-literal - 0x42d, // CHECK: := integer-literal - 0x42_d, // CHECK: := user-defined-integer-literal - 0x42ds, // CHECK: := user-defined-integer-literal - 0x1.2p2,// CHECK: := floating-point-literal - - "", // CHECK: literal := string-literal - L"", // CHECK: literal := string-literal - u8"", // CHECK: literal := string-literal - u"", // CHECK: literal := string-literal - U"", // CHECK: literal := string-literal - R"()", // CHECK: literal := string-literal - uR"()", // CHECK: literal := string-literal - "a" "b", // CHECK: literal := string-literal - u8"a" "b", // CHECK: literal := string-literal - u"a" u"b", // CHECK: literal := string-literal - "a"_u "b", // CHECK: user-defined-literal := user-defined-string-literal - "a"_u u"b", // CHECK: user-defined-literal := user-defined-string-literal - R"(a)" "\n", // CHECK: literal := string-literal - R"c(a)c"_u u"\n", // CHECK: user-defined-literal := user-defined-string-literal - - 'a', // CHECK: := character-literal - 'abc', // CHECK: := character-literal - 'abcdef', // CHECK: := character-literal - u'a', // CHECK: := character-literal - U'a', // CHECK: := character-literal - L'a', // CHECK: := character-literal - L'abc', // CHECK: := character-literal - U'\u1234',// CHECK: := character-literal - '\u1234', // CHECK: := character-literal - u'a'_u, // CHECK: := user-defined-character-literal -}; - diff --git a/clang-tools-extra/pseudo/test/cxx/mixed-designator.cpp b/clang-tools-extra/pseudo/test/cxx/mixed-designator.cpp deleted file mode 100644 index d605a3d66a5de..0000000000000 --- a/clang-tools-extra/pseudo/test/cxx/mixed-designator.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s -// FIXME: tighten CHECK to CHECK-NEXT once numeric literals are unambiguous. -auto x = { 1, .f = 2, [c]{3} }; -// CHECK: initializer-clause~braced-init-list -// CHECK-NEXT: ├─{ := tok[3] -// CHECK-NEXT: ├─initializer-list -// CHECK-NEXT: │ ├─initializer-list -// CHECK-NEXT: │ │ ├─initializer-list~NUMERIC_CONSTANT -// CHECK-NEXT: │ │ ├─, := tok[5] -// CHECK-NEXT: │ │ └─initializer-list-item -// CHECK-NEXT: │ │ ├─designator -// CHECK-NEXT: │ │ │ ├─. := tok[6] -// CHECK-NEXT: │ │ │ └─IDENTIFIER := tok[7] -// CHECK-NEXT: │ │ └─brace-or-equal-initializer -// CHECK-NEXT: │ │ ├─= := tok[8] -// CHECK-NEXT: │ │ └─initializer-clause~NUMERIC_CONSTANT -// CHECK-NEXT: │ ├─, := tok[10] -// CHECK-NEXT: │ └─initializer-list-item -// CHECK-NEXT: │ ├─designator -// CHECK-NEXT: │ │ ├─[ := tok[11] -// CHECK-NEXT: │ │ ├─expression~IDENTIFIER := tok[12] -// CHECK-NEXT: │ │ └─] := tok[13] -// CHECK-NEXT: │ └─brace-or-equal-initializer~braced-init-list -// CHECK-NEXT: │ ├─{ := tok[14] -// CHECK-NEXT: │ ├─initializer-list~NUMERIC_CONSTANT -// CHECK: │ └─} := tok[16] -// CHECK-NEXT: └─} := tok[17] diff --git a/clang-tools-extra/pseudo/test/cxx/nested-name-specifier.cpp b/clang-tools-extra/pseudo/test/cxx/nested-name-specifier.cpp deleted file mode 100644 index 41d0fa13ff6dd..0000000000000 --- a/clang-tools-extra/pseudo/test/cxx/nested-name-specifier.cpp +++ /dev/null @@ -1,28 +0,0 @@ -// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s - -// Verify that we don't form a complete `::` nested-name-specifier if there is -// an identifier preceding it. -Foo::Foo() {} // No "Foo ::Foo()" false parse -// CHECK: ├─declaration-seq~function-definition := function-declarator function-body -// CHECK-NEXT: │ ├─function-declarator~noptr-declarator := noptr-declarator parameters-and-qualifiers - -int ::x; -// CHECK: declaration~simple-declaration := decl-specifier-seq init-declarator-list ; -// CHECK-NEXT: ├─decl-specifier-seq~INT - -void test() { - X::Y::Z; // No false qualified-declarator parses "X ::Y::Z" and "X::Y ::Z". -// CHECK: statement-seq~statement := -// CHECK: statement~expression-statement := expression ; -// CHECK: statement~simple-declaration := decl-specifier-seq ; -// CHECK-NOT: simple-declaration := decl-specifier-seq init-declarator-list ; - - // FIXME: eliminate the false `a ::c` declaration parse. - a::c; -// CHECK: statement := -// CHECK-NEXT: ├─statement~expression-statement := expression ; -// CHECK-NEXT: │ ├─expression~relational-expression := -// CHECK: └─statement~simple-declaration := -// CHECK-NEXT: ├─simple-declaration := decl-specifier-seq ; -// CHECK: └─simple-declaration := decl-specifier-seq init-declarator-list ; -} diff --git a/clang-tools-extra/pseudo/test/cxx/parameter-decl-clause.cpp b/clang-tools-extra/pseudo/test/cxx/parameter-decl-clause.cpp deleted file mode 100644 index 1426f4e0a9bc0..0000000000000 --- a/clang-tools-extra/pseudo/test/cxx/parameter-decl-clause.cpp +++ /dev/null @@ -1,14 +0,0 @@ -// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s -void foo2(int, ...); -// CHECK: translation-unit~simple-declaration := decl-specifier-seq init-declarator-list ; -// CHECK-NEXT: ├─decl-specifier-seq~VOID := -// CHECK-NEXT: ├─init-declarator-list~noptr-declarator := noptr-declarator parameters-and-qualifiers -// CHECK-NEXT: │ ├─noptr-declarator~IDENTIFIER := -// CHECK-NEXT: │ └─parameters-and-qualifiers := ( parameter-declaration-clause [recover=Brackets] ) -// CHECK-NEXT: │ ├─( := -// CHECK-NEXT: │ ├─parameter-declaration-clause := parameter-declaration-list , ... -// CHECK-NEXT: │ │ ├─parameter-declaration-list~INT := -// CHECK-NEXT: │ │ ├─, := -// CHECK-NEXT: │ │ └─... := -// CHECK-NEXT: │ └─) := -// CHECK-NEXT: └─; := diff --git a/clang-tools-extra/pseudo/test/cxx/predefined-identifier.cpp b/clang-tools-extra/pseudo/test/cxx/predefined-identifier.cpp deleted file mode 100644 index 5d48a3a43d027..0000000000000 --- a/clang-tools-extra/pseudo/test/cxx/predefined-identifier.cpp +++ /dev/null @@ -1,5 +0,0 @@ -// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s -void s() { - __func__; - // CHECK: expression~__FUNC__ := tok[5] -} diff --git a/clang-tools-extra/pseudo/test/cxx/recovery-func-parameters.cpp b/clang-tools-extra/pseudo/test/cxx/recovery-func-parameters.cpp deleted file mode 100644 index 0b41f881fa3bf..0000000000000 --- a/clang-tools-extra/pseudo/test/cxx/recovery-func-parameters.cpp +++ /dev/null @@ -1,13 +0,0 @@ -// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s -void foo(complete garbage???) {} -// CHECK: translation-unit~function-definition := decl-specifier-seq function-declarator function-body -// CHECK-NEXT: ├─decl-specifier-seq~VOID := tok[0] -// CHECK-NEXT: ├─function-declarator~noptr-declarator := noptr-declarator parameters-and-qualifiers -// CHECK-NEXT: │ ├─noptr-declarator~IDENTIFIER := tok[1] -// CHECK-NEXT: │ └─parameters-and-qualifiers := ( parameter-declaration-clause [recover=Brackets] ) -// CHECK-NEXT: │ ├─( := tok[2] -// CHECK-NEXT: │ ├─parameter-declaration-clause := -// CHECK-NEXT: │ └─) := tok[8] -// CHECK-NEXT: └─function-body~compound-statement := { } -// CHECK-NEXT: ├─{ := tok[9] -// CHECK-NEXT: └─} := tok[10] diff --git a/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp b/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp deleted file mode 100644 index 38216ad964772..0000000000000 --- a/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp +++ /dev/null @@ -1,13 +0,0 @@ -// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s -auto x = { complete garbage }; -// CHECK: translation-unit~simple-declaration -// CHECK-NEXT: ├─decl-specifier-seq~AUTO := tok[0] -// CHECK-NEXT: ├─init-declarator-list~init-declarator -// CHECK-NEXT: │ ├─non-function-declarator~IDENTIFIER := tok[1] -// CHECK-NEXT: │ └─initializer~brace-or-equal-initializer -// CHECK-NEXT: │ ├─= := tok[2] -// CHECK-NEXT: │ └─initializer-clause~braced-init-list -// CHECK-NEXT: │ ├─{ := tok[3] -// CHECK-NEXT: │ ├─initializer-list := -// CHECK-NEXT: │ └─} := tok[6] -// CHECK-NEXT: └─; := tok[7] diff --git a/clang-tools-extra/pseudo/test/cxx/structured-binding.cpp b/clang-tools-extra/pseudo/test/cxx/structured-binding.cpp deleted file mode 100644 index 1c68e928ddd62..0000000000000 --- a/clang-tools-extra/pseudo/test/cxx/structured-binding.cpp +++ /dev/null @@ -1,6 +0,0 @@ -// RUN: clang-pseudo -grammar=cxx -source=%s --start-symbol=statement-seq --print-forest | FileCheck %s - -// Verify there is no false parse of the structured binding declaration. -ABC[post] = abc; -// CHECK: statement-seq~expression-statement := expression ; -// CHECK: postfix-expression [ expr-or-braced-init-list ] diff --git a/clang-tools-extra/pseudo/test/cxx/template-empty-type-parameter.cpp b/clang-tools-extra/pseudo/test/cxx/template-empty-type-parameter.cpp deleted file mode 100644 index 02aff285f838c..0000000000000 --- a/clang-tools-extra/pseudo/test/cxx/template-empty-type-parameter.cpp +++ /dev/null @@ -1,3 +0,0 @@ -// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s -template struct MatchParents; -// CHECK: template-parameter-list~TYPENAME := tok[2] diff --git a/clang-tools-extra/pseudo/test/cxx/unsized-array.cpp b/clang-tools-extra/pseudo/test/cxx/unsized-array.cpp deleted file mode 100644 index 1f7b106e0e93b..0000000000000 --- a/clang-tools-extra/pseudo/test/cxx/unsized-array.cpp +++ /dev/null @@ -1,7 +0,0 @@ -// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s -void s(int[]); -// CHECK: parameter-declaration-clause~parameter-declaration := decl-specifier-seq abstract-declarator -// CHECK-NEXT: ├─decl-specifier-seq~INT := tok[3] -// CHECK-NEXT: └─abstract-declarator~noptr-abstract-declarator := [ ] -// CHECK-NEXT: ├─[ := tok[4] -// CHECK-NEXT: └─] := tok[5] diff --git a/clang-tools-extra/pseudo/test/fuzzer.cpp b/clang-tools-extra/pseudo/test/fuzzer.cpp deleted file mode 100644 index 400746a9d12d5..0000000000000 --- a/clang-tools-extra/pseudo/test/fuzzer.cpp +++ /dev/null @@ -1,4 +0,0 @@ -// RUN: clang-pseudo-fuzzer -grammar=%cxx-bnf-file -print %s | FileCheck %s -int x; -// CHECK: translation-unit := declaration-seq -// CHECK: builtin-type := INT diff --git a/clang-tools-extra/pseudo/test/glr-variant-start.cpp b/clang-tools-extra/pseudo/test/glr-variant-start.cpp deleted file mode 100644 index 1bd073707353b..0000000000000 --- a/clang-tools-extra/pseudo/test/glr-variant-start.cpp +++ /dev/null @@ -1,9 +0,0 @@ -// RUN: clang-pseudo -grammar=%cxx-bnf-file -source=%s --start-symbol=statement-seq --print-forest | FileCheck %s - -a + a; -// CHECK: statement-seq~expression-statement := expression ; -// CHECK-NEXT: ├─expression~additive-expression := additive-expression + multiplicative-expression -// CHECK-NEXT: │ ├─additive-expression~IDENTIFIER := -// CHECK-NEXT: │ ├─+ := -// CHECK-NEXT: │ └─multiplicative-expression~IDENTIFIER := -// CHECK-NEXT: └─; := diff --git a/clang-tools-extra/pseudo/test/glr.cpp b/clang-tools-extra/pseudo/test/glr.cpp deleted file mode 100644 index f805e42ffa6dd..0000000000000 --- a/clang-tools-extra/pseudo/test/glr.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest -print-statistics | FileCheck %s - -void foo() { - T* a; // a multiply expression or a pointer declaration? -// CHECK: statement-seq~statement := -// CHECK-NEXT: ├─statement~expression-statement := expression ; -// CHECK-NEXT: │ ├─expression~multiplicative-expression := multiplicative-expression * pm-expression -// CHECK-NEXT: │ │ ├─multiplicative-expression~IDENTIFIER := tok[5] -// CHECK-NEXT: │ │ ├─* := tok[6] -// CHECK-NEXT: │ │ └─pm-expression~id-expression := unqualified-id #1 -// CHECK-NEXT: │ │ └─unqualified-id~IDENTIFIER := tok[7] -// CHECK-NEXT: │ └─; := tok[8] -// CHECK-NEXT: └─statement~simple-declaration := decl-specifier-seq init-declarator-list ; -// CHECK-NEXT: ├─decl-specifier-seq~simple-type-specifier := -// CHECK-NEXT: │ ├─simple-type-specifier~IDENTIFIER := tok[5] -// CHECK-NEXT: │ └─simple-type-specifier~IDENTIFIER := tok[5] -// CHECK-NEXT: ├─init-declarator-list~ptr-declarator := ptr-operator ptr-declarator -// CHECK-NEXT: │ ├─ptr-operator~* := tok[6] -// CHECK-NEXT: │ └─ptr-declarator~id-expression =#1 -// CHECK-NEXT: └─; := tok[8] -} - -// CHECK: 2 Ambiguous nodes: -// CHECK-NEXT: 1 simple-type-specifier -// CHECK-NEXT: 1 statement -// CHECK-EMPTY: -// CHECK-NEXT: 0 Opaque nodes: -// CHECK-EMPTY: -// CHECK-NEXT: Ambiguity: 0.20 misparses/token -// CHECK-NEXT: Unparsed: 0.00% diff --git a/clang-tools-extra/pseudo/test/html-forest.c b/clang-tools-extra/pseudo/test/html-forest.c deleted file mode 100644 index 0be08da49f4a7..0000000000000 --- a/clang-tools-extra/pseudo/test/html-forest.c +++ /dev/null @@ -1,8 +0,0 @@ -// RUN: clang-pseudo -source %s -html-forest=%t.html -// RUN: FileCheck %s < %t.html -int main() { -} -// Sanity check for some obvious strings. -// CHECK-DAG: -// CHECK-DAG: "compound-statement" -// CHECK-DAG: main diff --git a/clang-tools-extra/pseudo/test/lex.c b/clang-tools-extra/pseudo/test/lex.c deleted file mode 100644 index ebebd2e0fb72f..0000000000000 --- a/clang-tools-extra/pseudo/test/lex.c +++ /dev/null @@ -1,42 +0,0 @@ -int is_debug() { -#ifndef NDEBUG - return 1; // in debug mode -#else - return 0; -#endif -} - -/* This comment gets lexed along with the input above! We just don't CHECK it. - -RUN: clang-pseudo -source %s -print-source | FileCheck %s -check-prefix=SOURCE --strict-whitespace - SOURCE: int is_debug() { -SOURCE-NEXT: #ifndef NDEBUG -SOURCE-NEXT: return 1; // in debug mode -SOURCE-NEXT: #else -SOURCE-NEXT: return 0; -SOURCE-NEXT: #end -SOURCE-NEXT: } - -RUN: clang-pseudo -source %s -print-tokens | FileCheck %s -check-prefix=TOKEN - TOKEN: 0: raw_identifier 0:0 "int" flags=1 -TOKEN-NEXT: raw_identifier 0:0 "is_debug" -TOKEN-NEXT: l_paren 0:0 "(" -TOKEN-NEXT: r_paren 0:0 ")" -TOKEN-NEXT: l_brace 0:0 "{" -TOKEN-NEXT: hash 1:0 "#" flags=1 -TOKEN-NEXT: raw_identifier 1:0 "ifndef" -TOKEN-NEXT: raw_identifier 1:0 "NDEBUG" -TOKEN-NEXT: raw_identifier 2:2 "return" flags=1 -TOKEN-NEXT: numeric_constant 2:2 "1" -TOKEN-NEXT: semi 2:2 ";" -TOKEN-NEXT: comment 2:2 "// in debug mode" -TOKEN-NEXT: hash 3:0 "#" flags=1 -TOKEN-NEXT: raw_identifier 3:0 "else" -TOKEN-NEXT: raw_identifier 4:2 "return" flags=1 -TOKEN-NEXT: numeric_constant 4:2 "0" -TOKEN-NEXT: semi 4:2 ";" -TOKEN-NEXT: hash 5:0 "#" flags=1 -TOKEN-NEXT: raw_identifier 5:0 "endif" -TOKEN-NEXT: r_brace 6:0 "}" flags=1 - -*******************************************************************************/ diff --git a/clang-tools-extra/pseudo/test/lr-build-basic.test b/clang-tools-extra/pseudo/test/lr-build-basic.test deleted file mode 100644 index 13036349eb8c1..0000000000000 --- a/clang-tools-extra/pseudo/test/lr-build-basic.test +++ /dev/null @@ -1,32 +0,0 @@ -_ := expr EOF -expr := id -id := IDENTIFIER - -# RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH -# GRAPH: States: -# GRAPH-NEXT: State 0 -# GRAPH-NEXT: _ := • expr EOF -# GRAPH-NEXT: expr := • id -# GRAPH-NEXT: id := • IDENTIFIER -# GRAPH-NEXT: State 1 -# GRAPH-NEXT: _ := expr • EOF -# GRAPH-NEXT: State 2 -# GRAPH-NEXT: expr := id • -# GRAPH-NEXT: State 3 -# GRAPH-NEXT: id := IDENTIFIER • -# GRAPH-NEXT: State 4 -# GRAPH-NEXT: _ := expr EOF • - -# RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE -# TABLE: LRTable: -# TABLE-NEXT: State 0 -# TABLE-NEXT: IDENTIFIER: shift state 3 -# TABLE-NEXT: expr: go to state 1 -# TABLE-NEXT: id: go to state 2 -# TABLE-NEXT: State 1 -# TABLE-NEXT: EOF: shift state 4 -# TABLE-NEXT: State 2 -# TABLE-NEXT: EOF: reduce by rule 2 'expr := id' -# TABLE-NEXT: State 3 -# TABLE-NEXT: EOF: reduce by rule 1 'id := IDENTIFIER' -# TABLE-NEXT: State 4 diff --git a/clang-tools-extra/pseudo/test/lr-build-conflicts.test b/clang-tools-extra/pseudo/test/lr-build-conflicts.test deleted file mode 100644 index a66ce4d622ca1..0000000000000 --- a/clang-tools-extra/pseudo/test/lr-build-conflicts.test +++ /dev/null @@ -1,49 +0,0 @@ -_ := expr EOF -expr := expr - expr # S/R conflict at state 4 on '-' token -expr := IDENTIFIER - -# RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH -# GRAPH: States -# GRAPH-NEXT: State 0 -# GRAPH-NEXT: _ := • expr EOF -# GRAPH-NEXT: expr := • expr - expr -# GRAPH-NEXT: expr := • IDENTIFIER -# GRAPH-NEXT: State 1 -# GRAPH-NEXT: _ := expr • EOF -# GRAPH-NEXT: expr := expr • - expr -# GRAPH-NEXT: State 2 -# GRAPH-NEXT: expr := IDENTIFIER • -# GRAPH-NEXT: State 3 -# GRAPH-NEXT: _ := expr EOF • -# GRAPH-NEXT: State 4 -# GRAPH-NEXT: expr := • expr - expr -# GRAPH-NEXT: expr := expr - • expr -# GRAPH-NEXT: expr := • IDENTIFIER -# GRAPH-NEXT: State 5 -# GRAPH-NEXT: expr := expr - expr • -# GRAPH-NEXT: expr := expr • - expr -# GRAPH-NEXT: 0 ->[expr] 1 -# GRAPH-NEXT: 0 ->[IDENTIFIER] 2 -# GRAPH-NEXT: 1 ->[EOF] 3 -# GRAPH-NEXT: 1 ->[-] 4 -# GRAPH-NEXT: 4 ->[expr] 5 -# GRAPH-NEXT: 4 ->[IDENTIFIER] 2 -# GRAPH-NEXT: 5 ->[-] 4 - -# RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE -# TABLE: LRTable: -# TABLE-NEXT: State 0 -# TABLE-NEXT: IDENTIFIER: shift state 2 -# TABLE-NEXT: expr: go to state 1 -# TABLE-NEXT: State 1 -# TABLE-NEXT: EOF: shift state 3 -# TABLE-NEXT: -: shift state 4 -# TABLE-NEXT: State 2 -# TABLE-NEXT: EOF -: reduce by rule 2 'expr := IDENTIFIER' -# TABLE-NEXT: State 3 -# TABLE-NEXT: State 4 -# TABLE-NEXT: IDENTIFIER: shift state 2 -# TABLE-NEXT: expr: go to state 5 -# TABLE-NEXT: State 5 -# TABLE-NEXT: -: shift state 4 -# TABLE-NEXT: EOF -: reduce by rule 1 'expr := expr - expr' diff --git a/clang-tools-extra/pseudo/test/strip-directives.c b/clang-tools-extra/pseudo/test/strip-directives.c deleted file mode 100644 index c7878d9295a08..0000000000000 --- a/clang-tools-extra/pseudo/test/strip-directives.c +++ /dev/null @@ -1,49 +0,0 @@ -#include -int main() { -#error This was inevitable... -#if HELLO - printf("hello, world\n"); - return 0; -#else - abort(); -#endif -} - -/* This comment gets lexed along with the input above! We just don't CHECK it. - -RUN: clang-pseudo -source %s -print-directive-tree | FileCheck %s -check-prefix=PPT --strict-whitespace - PPT: #include (7 tokens) -PPT-NEXT: code (5 tokens) -PPT-NEXT: #error (6 tokens) -PPT-NEXT: #if (3 tokens) TAKEN -PPT-NEXT: code (8 tokens) -PPT-NEXT: #else (2 tokens) -PPT-NEXT: code (4 tokens) -PPT-NEXT: #endif (2 tokens) -PPT-NEXT: code (2 tokens) - ^ including this block comment - -RUN: clang-pseudo -source %s -strip-directives -print-source | FileCheck %s --strict-whitespace - CHECK: int main() { -CHECK-NEXT: printf("hello, world\n"); -CHECK-NEXT: return 0; -CHECK-NEXT: } - -RUN: clang-pseudo -source %s -strip-directives -print-tokens | FileCheck %s --check-prefix=TOKEN - TOKEN: 0: raw_identifier 1:0 "int" flags=1 -TOKEN-NEXT: raw_identifier 1:0 "main" -TOKEN-NEXT: l_paren 1:0 "(" -TOKEN-NEXT: r_paren 1:0 ")" -TOKEN-NEXT: l_brace 1:0 "{" -TOKEN-NEXT: raw_identifier 4:2 "printf" flags=1 -TOKEN-NEXT: l_paren 4:2 "(" -TOKEN-NEXT: string_literal 4:2 "\22hello, world\\n\22" -TOKEN-NEXT: r_paren 4:2 ")" -TOKEN-NEXT: semi 4:2 ";" -TOKEN-NEXT: raw_identifier 5:2 "return" flags=1 -TOKEN-NEXT: numeric_constant 5:2 "0" -TOKEN-NEXT: semi 5:2 ";" -TOKEN-NEXT: r_brace 9:0 "}" flags=1 - -*******************************************************************************/ - diff --git a/clang-tools-extra/pseudo/tool/CMakeLists.txt b/clang-tools-extra/pseudo/tool/CMakeLists.txt deleted file mode 100644 index 49e1dc29a5a4e..0000000000000 --- a/clang-tools-extra/pseudo/tool/CMakeLists.txt +++ /dev/null @@ -1,29 +0,0 @@ -set(LLVM_LINK_COMPONENTS support) - -add_clang_tool(clang-pseudo - ClangPseudo.cpp - HTMLForest.cpp - ) - -clang_target_link_libraries(clang-pseudo - PRIVATE - clangBasic - ) - -target_link_libraries(clang-pseudo - PRIVATE - clangPseudo - clangPseudoGrammar - clangPseudoCLI - ) - -add_custom_command(OUTPUT HTMLForestResources.inc - COMMAND "${Python3_EXECUTABLE}" ${CLANG_SOURCE_DIR}/utils/bundle_resources.py - ${CMAKE_CURRENT_BINARY_DIR}/HTMLForestResources.inc - HTMLForest.css HTMLForest.js HTMLForest.html - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - COMMENT "Bundling HTMLForest resources" - DEPENDS ${CLANG_SOURCE_DIR}/utils/bundle_resources.py HTMLForest.css HTMLForest.js HTMLForest.html - VERBATIM) -add_custom_target(clang-pseudo-resources DEPENDS HTMLForestResources.inc) -add_dependencies(clang-pseudo clang-pseudo-resources) diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp deleted file mode 100644 index 6a64760749cef..0000000000000 --- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp +++ /dev/null @@ -1,243 +0,0 @@ -//===-- ClangPseudo.cpp - Clang pseudoparser tool -------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "clang-pseudo/Bracket.h" -#include "clang-pseudo/DirectiveTree.h" -#include "clang-pseudo/Disambiguate.h" -#include "clang-pseudo/Forest.h" -#include "clang-pseudo/GLR.h" -#include "clang-pseudo/Language.h" -#include "clang-pseudo/Token.h" -#include "clang-pseudo/cli/CLI.h" -#include "clang-pseudo/grammar/Grammar.h" -#include "clang-pseudo/grammar/LRGraph.h" -#include "clang-pseudo/grammar/LRTable.h" -#include "clang/Basic/LangOptions.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/STLFunctionalExtras.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/Signals.h" -#include - -using clang::pseudo::ForestNode; -using clang::pseudo::Token; -using clang::pseudo::TokenStream; -using llvm::cl::desc; -using llvm::cl::init; -using llvm::cl::opt; - -static opt PrintGrammar("print-grammar", desc("Print the grammar")); -static opt PrintGraph("print-graph", - desc("Print the LR graph for the grammar")); -static opt PrintTable("print-table", - desc("Print the LR table for the grammar")); -static opt Source("source", desc("Source file")); -static opt PrintSource("print-source", desc("Print token stream")); -static opt PrintTokens("print-tokens", desc("Print detailed token info")); -static opt - PrintDirectiveTree("print-directive-tree", - desc("Print directive structure of source code")); -static opt - StripDirectives("strip-directives", - desc("Strip directives and select conditional sections")); -static opt Disambiguate("disambiguate", - desc("Choose best tree from parse forest")); -static opt PrintStatistics("print-statistics", desc("Print GLR parser statistics")); -static opt PrintForest("print-forest", desc("Print parse forest")); -static opt ForestAbbrev("forest-abbrev", desc("Abbreviate parse forest"), - init(true)); -static opt HTMLForest("html-forest", - desc("output file for HTML forest")); -static opt StartSymbol("start-symbol", - desc("Specify the start symbol to parse"), - init("translation-unit")); - -static std::string readOrDie(llvm::StringRef Path) { - llvm::ErrorOr> Text = - llvm::MemoryBuffer::getFile(Path); - if (std::error_code EC = Text.getError()) { - llvm::errs() << "Error: can't read file '" << Path - << "': " << EC.message() << "\n"; - ::exit(1); - } - return Text.get()->getBuffer().str(); -} - -namespace clang { -namespace pseudo { -// Defined in HTMLForest.cpp -void writeHTMLForest(llvm::raw_ostream &OS, const Grammar &, - const ForestNode &Root, const Disambiguation &, - const TokenStream &); -namespace { - -struct NodeStats { - unsigned Total = 0; - std::vector> BySymbol; - - NodeStats(const ForestNode &Root, - llvm::function_ref Filter) { - llvm::DenseMap Map; - for (const ForestNode &N : Root.descendants()) - if (Filter(N)) { - ++Total; - ++Map[N.symbol()]; - } - BySymbol = {Map.begin(), Map.end()}; - // Sort by count descending, then symbol ascending. - llvm::sort(BySymbol, [](const auto &L, const auto &R) { - return std::tie(R.second, L.first) < std::tie(L.second, R.first); - }); - } -}; - -} // namespace -} // namespace pseudo -} // namespace clang - -int main(int argc, char *argv[]) { - llvm::cl::ParseCommandLineOptions(argc, argv, ""); - llvm::sys::PrintStackTraceOnErrorSignal(argv[0]); - - clang::LangOptions LangOpts = clang::pseudo::genericLangOpts(); - std::string SourceText; - std::optional RawStream; - std::optional PreprocessedStream; - std::optional ParseableStream; - if (Source.getNumOccurrences()) { - SourceText = readOrDie(Source); - RawStream = clang::pseudo::lex(SourceText, LangOpts); - TokenStream *Stream = &*RawStream; - - auto DirectiveStructure = clang::pseudo::DirectiveTree::parse(*RawStream); - clang::pseudo::chooseConditionalBranches(DirectiveStructure, *RawStream); - - std::optional Preprocessed; - if (StripDirectives) { - Preprocessed = DirectiveStructure.stripDirectives(*Stream); - Stream = &*Preprocessed; - } - - if (PrintSource) - Stream->print(llvm::outs()); - if (PrintTokens) - llvm::outs() << *Stream; - if (PrintDirectiveTree) - llvm::outs() << DirectiveStructure; - - ParseableStream = clang::pseudo::stripComments(cook(*Stream, LangOpts)); - pairBrackets(*ParseableStream); - } - - const auto &Lang = clang::pseudo::getLanguageFromFlags(); - if (PrintGrammar) - llvm::outs() << Lang.G.dump(); - if (PrintGraph) - llvm::outs() << clang::pseudo::LRGraph::buildLR0(Lang.G).dumpForTests( - Lang.G); - - if (PrintTable) - llvm::outs() << Lang.Table.dumpForTests(Lang.G); - if (PrintStatistics) - llvm::outs() << Lang.Table.dumpStatistics(); - - if (ParseableStream) { - clang::pseudo::ForestArena Arena; - clang::pseudo::GSS GSS; - std::optional StartSymID = - Lang.G.findNonterminal(StartSymbol); - if (!StartSymID) { - llvm::errs() << llvm::formatv( - "The start symbol {0} doesn't exit in the grammar!\n", StartSymbol); - return 2; - } - auto &Root = - glrParse(clang::pseudo::ParseParams{*ParseableStream, Arena, GSS}, - *StartSymID, Lang); - // If we're disambiguating, we'll print at the end instead. - if (PrintForest && !Disambiguate) - llvm::outs() << Root.dumpRecursive(Lang.G, /*Abbreviated=*/ForestAbbrev); - clang::pseudo::Disambiguation Disambig; - if (Disambiguate) - Disambig = clang::pseudo::disambiguate(&Root, {}); - - if (HTMLForest.getNumOccurrences()) { - std::error_code EC; - llvm::raw_fd_ostream HTMLOut(HTMLForest, EC); - if (EC) { - llvm::errs() << "Couldn't write " << HTMLForest << ": " << EC.message() - << "\n"; - return 2; - } - clang::pseudo::writeHTMLForest(HTMLOut, Lang.G, Root, Disambig, - *ParseableStream); - } - - if (PrintStatistics) { - llvm::outs() << "Forest bytes: " << Arena.bytes() - << " nodes: " << Arena.nodeCount() << "\n"; - llvm::outs() << "GSS bytes: " << GSS.bytes() - << " nodes: " << GSS.nodesCreated() << "\n"; - - for (auto &P : {std::make_pair("Ambiguous", ForestNode::Ambiguous), - std::make_pair("Opaque", ForestNode::Opaque)}) { - clang::pseudo::NodeStats Stats( - Root, [&](const auto &N) { return N.kind() == P.second; }); - llvm::outs() << "\n" << Stats.Total << " " << P.first << " nodes:\n"; - for (const auto &S : Stats.BySymbol) - llvm::outs() << llvm::formatv(" {0,3} {1}\n", S.second, - Lang.G.symbolName(S.first)); - } - - // Metrics for how imprecise parsing was. - // These are rough but aim to be: - // - linear: if we eliminate half the errors the metric should halve - // - length-independent - unsigned UnparsedTokens = 0; // Tokens covered by Opaque. (not unique) - unsigned Misparses = 0; // Sum of alternatives-1 - llvm::DenseSet Visited; - auto DFS = [&](const ForestNode &N, Token::Index End, auto &DFS) -> void { - if (N.kind() == ForestNode::Opaque) { - UnparsedTokens += End - N.startTokenIndex(); - } else if (N.kind() == ForestNode::Ambiguous) { - Misparses += N.alternatives().size() - 1; - for (const auto *C : N.alternatives()) - if (Visited.insert(C).second) - DFS(*C, End, DFS); - } else if (N.kind() == ForestNode::Sequence) { - for (unsigned I = 0, E = N.children().size(); I < E; ++I) - if (Visited.insert(N.children()[I]).second) - DFS(*N.children()[I], - I + 1 == N.children().size() - ? End - : N.children()[I + 1]->startTokenIndex(), - DFS); - } - }; - unsigned Len = ParseableStream->tokens().size(); - DFS(Root, Len, DFS); - llvm::outs() << "\n"; - llvm::outs() << llvm::formatv("Ambiguity: {0} misparses/token\n", - double(Misparses) / Len); - llvm::outs() << llvm::formatv("Unparsed: {0}%\n", - 100.0 * UnparsedTokens / Len); - } - - if (Disambiguate && PrintForest) { - ForestNode *DisambigRoot = &Root; - removeAmbiguities(DisambigRoot, Disambig); - llvm::outs() << "Disambiguated tree:\n"; - llvm::outs() << DisambigRoot->dumpRecursive(Lang.G, - /*Abbreviated=*/ForestAbbrev); - } - } - - return 0; -} diff --git a/clang-tools-extra/pseudo/tool/HTMLForest.cpp b/clang-tools-extra/pseudo/tool/HTMLForest.cpp deleted file mode 100644 index 184430bddd8d6..0000000000000 --- a/clang-tools-extra/pseudo/tool/HTMLForest.cpp +++ /dev/null @@ -1,192 +0,0 @@ -//===-- HTMLForest.cpp - browser-based parse forest explorer -//---------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// The plain text forest node dump (clang-pseudo -print-forest) is useful but -// hard to reconcile with the code being examined, especially when it is large. -// -// HTMLForest produces a self-contained HTML file containing both the code and -// the forest representation, linking them interactively with javascript. -// At any given time, a single parse tree is shown (ambiguities resolved). -// The user can switch between ambiguous alternatives. -// -// +-------+---------------+ -// | | +-----+| -// | #tree | #code |#info|| -// | | +-----+| -// | | | -// +-------+---------------+ -// -// #tree is a hierarchical view of the nodes (nested
    s), like -print-forest. -// (It is a simple tree, not a DAG, because ambiguities have been resolved). -// Like -print-forest, trivial sequences are collapsed (expression~IDENTIFIER). -// -// #code is the source code, annotated with s marking the node ranges. -// These spans are usually invisible (exception: ambiguities are marked), but -// they are used to show and change the selection. -// -// #info is a floating box that shows details of the currently selected node: -// - rule (for sequence nodes). Abbreviated rules are also shown. -// - alternatives (for ambiguous nodes). The user can choose an alternative. -// - ancestors. The parent nodes show how this node fits in translation-unit. -// -// There are two types of 'active' node: -// - *highlight* is what the cursor is over, and is colored blue. -// Near ancestors are shaded faintly (onion-skin) to show local structure. -// - *selection* is set by clicking. -// The #info box shows the selection, and selected nodes have a dashed ring. -// -//===----------------------------------------------------------------------===// - -#include "clang-pseudo/Disambiguate.h" -#include "clang-pseudo/Forest.h" -#include "clang-pseudo/grammar/Grammar.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Support/JSON.h" -#include "llvm/Support/raw_ostream.h" -namespace clang { -namespace pseudo { -namespace { - -// Defines const char HTMLForest_css[] = "...contents of HTMLForest.css..."; etc -#include "HTMLForestResources.inc" - -struct Writer { - llvm::raw_ostream &Out; - const Grammar &G; - const ForestNode &Root; - const TokenStream &Stream; - const Disambiguation &Disambig; - - void write() { - Out << "\n"; - tag("html", [&] { - tag("head", [&] { - tag("title", [&] { Out << "HTMLForest"; }); - tag("script", [&] { Out << HTMLForest_js; }); - tag("style", [&] { Out << HTMLForest_css; }); - tag("script", [&] { - Out << "var forest="; - writeForestJSON(); - Out << ";"; - }); - tag("pre id='hidden-code' hidden", [&] { writeCode(); }); - }); - tag("body", [&] { Out << HTMLForest_html; }); - }); - } - - void writeCode(); - void writeForestJSON(); - void tag(llvm::StringRef Opener, llvm::function_ref Body) { - Out << "<" << Opener << ">"; - Body(); - Out << "\n"; - } -}; - -void Writer::writeCode() { - // This loop (whitespace logic) is cribbed from TokenStream::Print. - bool FirstToken = true; - unsigned LastLine = -1; - StringRef LastText; - for (const auto &T : Stream.tokens()) { - StringRef Text = T.text(); - if (FirstToken) { - FirstToken = false; - } else if (T.Line == LastLine) { - if (LastText.data() + LastText.size() != Text.data()) - Out << ' '; - } else { - Out << " \n"; // Extra space aids selection. - Out.indent(T.Indent); - } - Out << ""; - llvm::printHTMLEscaped(Text, Out); - Out << ""; - LastLine = T.Line; - LastText = Text; - } - if (!FirstToken) - Out << '\n'; -} - -// Writes a JSON array of forest nodes. Items are e.g.: -// {kind:'sequence', symbol:'compound-stmt', children:[5,8,33], -// rule:'compound-stmt := ...'} {kind:'terminal', symbol:'VOID', token:'t52'} -// {kind:'ambiguous', symbol:'type-specifier', children:[3,100] selected:3} -// {kind:'opaque', symbol:'statement-seq', firstToken:'t5', lastToken:'t6'} -void Writer::writeForestJSON() { - // This is the flat array of nodes: the index into this array is the node ID. - std::vector> Sequence; - llvm::DenseMap Index; - auto AssignID = [&](const ForestNode *N, Token::Index End) -> unsigned { - auto R = Index.try_emplace(N, Sequence.size()); - if (R.second) - Sequence.push_back({N, End}); - return R.first->second; - }; - AssignID(&Root, Stream.tokens().size()); - auto TokenID = [](Token::Index I) { return ("t" + llvm::Twine(I)).str(); }; - - llvm::json::OStream Out(this->Out, 2); - Out.array([&] { - for (unsigned I = 0; I < Sequence.size(); ++I) { - const ForestNode *N = Sequence[I].first; - Token::Index End = Sequence[I].second; - Out.object([&] { - Out.attribute("symbol", G.symbolName(N->symbol())); - switch (N->kind()) { - case ForestNode::Terminal: - Out.attribute("kind", "terminal"); - Out.attribute("token", TokenID(N->startTokenIndex())); - break; - case ForestNode::Sequence: - Out.attribute("kind", "sequence"); - Out.attribute("rule", G.dumpRule(N->rule())); - break; - case ForestNode::Ambiguous: - Out.attribute("kind", "ambiguous"); - Out.attribute("selected", - AssignID(N->children()[Disambig.lookup(N)], End)); - break; - case ForestNode::Opaque: - Out.attribute("kind", "opaque"); - Out.attribute("firstToken", TokenID(N->startTokenIndex())); - // [firstToken, lastToken] is a closed range. - // If empty, lastToken is omitted. - if (N->startTokenIndex() != End) - Out.attribute("lastToken", TokenID(End - 1)); - break; - } - auto Children = N->children(); - if (!Children.empty()) - Out.attributeArray("children", [&] { - for (unsigned I = 0; I < Children.size(); ++I) - Out.value(AssignID(Children[I], - I + 1 == Children.size() - ? End - : Children[I + 1]->startTokenIndex())); - }); - }); - } - }); -} - -} // namespace - -// We only accept the derived stream here. -// FIXME: allow the original stream instead? -void writeHTMLForest(llvm::raw_ostream &OS, const Grammar &G, - const ForestNode &Root, const Disambiguation &Disambig, - const TokenStream &Stream) { - Writer{OS, G, Root, Stream, Disambig}.write(); -} - -} // namespace pseudo -} // namespace clang diff --git a/clang-tools-extra/pseudo/tool/HTMLForest.css b/clang-tools-extra/pseudo/tool/HTMLForest.css deleted file mode 100644 index 674cd59f0e76b..0000000000000 --- a/clang-tools-extra/pseudo/tool/HTMLForest.css +++ /dev/null @@ -1,93 +0,0 @@ -body { - position: absolute; - top: 0; - bottom: 0; - right: 0; - left: 0; - - display: flex; - align-items: stretch; - margin: 0; - font-family: sans-serif; - white-space: nowrap; - height: 100%; -} -body > * { - overflow-y: auto; /* Scroll sections independently*/ - margin: 0; -} - -#code { - font-size: 18px; - line-height: 36px; - flex-grow: 1; - padding-right: 10em; /* Leave space for #info */ -} -#code span { - padding: 9px 0; /* No "gaps" between lines due to line-height */ -} -.node.ambiguous::before, .ancestors.ambiguous::after, .tree-node.ambiguous > header::after { - content: /*the thinking man's emoji*/'\01F914'; -} - -#info { - position: fixed; - right: 2em; - top: 1em; - width: 25em; - border: 1px solid black; - min-height: 20em; - background-color: whiteSmoke; - overflow-x: clip; - box-shadow: 3px 3px 5px rgba(0,0,0,0.2); -} -#info header { - background-color: black; - color: white; - font-size: larger; - padding: 0.5em; -} -#info.ambiguous header { background-color: #803; } -#info.sequence header { background-color: darkBlue; } -#info.terminal header { background-color: darkGreen; } -#info.opaque header { background-color: orangeRed; } -#i_kind { - float: right; - font-size: small; -} -#info section { - padding: 0.5em; - border-top: 1px solid lightGray; - overflow-x: auto; -} -#i_ancestors { font-size: small; } - -#tree { - flex-grow: 0; - min-width: 20em; - margin-right: 1em; - border-right: 1px solid darkGray; - background-color: azure; - font-size: small; - overflow-x: auto; - resize: horizontal; -} -#tree ul { - margin: 0; - display: inline-block; - padding-left: 6px; - border-left: 1px solid rgba(0,0,0,0.2); - list-style: none; -} -#tree > ul { border-left: none; } -.tree-node.selected > header .name { font-weight: bold; } -.tree-node.terminal .name { font-family: monospace; } -.tree-node.ambiguous > header .name { color: #803; font-weight: bold; } -.tree-node.sequence > header .name { color: darkBlue; } -.tree-node.terminal > header .name { color: darkGreen; } -.tree-node.opaque > header .name { color: orangeRed; } - -.selected { outline: 1px dashed black; } -.abbrev { opacity: 50%; } -.abbrev::after { content: '~'; } -.opaque { background-color: bisque; } diff --git a/clang-tools-extra/pseudo/tool/HTMLForest.html b/clang-tools-extra/pseudo/tool/HTMLForest.html deleted file mode 100644 index 4cf98cbbb2cc9..0000000000000 --- a/clang-tools-extra/pseudo/tool/HTMLForest.html +++ /dev/null @@ -1,15 +0,0 @@ -
      -
      
      -
      diff --git a/clang-tools-extra/pseudo/tool/HTMLForest.js b/clang-tools-extra/pseudo/tool/HTMLForest.js
      deleted file mode 100644
      index 24b88a5c10b47..0000000000000
      --- a/clang-tools-extra/pseudo/tool/HTMLForest.js
      +++ /dev/null
      @@ -1,290 +0,0 @@
      -// The global map of forest node index => NodeView.
      -views = [];
      -// NodeView is a visible forest node.
      -// It has an entry in the navigation tree, and a span in the code itself.
      -// Each NodeView is associated with a forest node, but not all nodes have views:
      -// - nodes not reachable though current ambiguity selection
      -// - trivial "wrapping" sequence nodes are abbreviated away
      -class NodeView {
      -  // Builds a node representing forest[index], or its target if it is a wrapper.
      -  // Registers the node in the global map.
      -  static make(index, parent, abbrev) {
      -    var node = forest[index];
      -    if (node.kind == 'sequence' && node.children.length == 1 &&
      -        forest[node.children[0]].kind != 'ambiguous') {
      -      abbrev ||= [];
      -      abbrev.push(index);
      -      return NodeView.make(node.children[0], parent, abbrev);
      -    }
      -    return views[index] = new NodeView(index, parent, node, abbrev);
      -  }
      -
      -  constructor(index, parent, node, abbrev) {
      -    this.abbrev = abbrev || [];
      -    this.parent = parent;
      -    this.children =
      -        (node.kind == 'ambiguous' ? [ node.selected ] : node.children || [])
      -            .map((c) => NodeView.make(c, this));
      -    this.index = index;
      -    this.node = node;
      -    views[index] = this;
      -
      -    this.span = this.buildSpan();
      -    this.tree = this.buildTree();
      -  }
      -
      -  // Replaces the token sequence in #code with a .
      -  buildSpan() {
      -    var elt = document.createElement('span');
      -    elt.dataset['index'] = this.index;
      -    elt.classList.add("node");
      -    elt.classList.add("selectable-node");
      -    elt.classList.add(this.node.kind);
      -
      -    var begin = null, end = null;
      -    if (this.children.length != 0) {
      -      begin = this.children[0].span;
      -      end = this.children[this.children.length - 1].span.nextSibling;
      -    } else if (this.node.kind == 'terminal') {
      -      begin = document.getElementById(this.node.token);
      -      end = begin.nextSibling;
      -    } else if (this.node.kind == 'opaque') {
      -      begin = document.getElementById(this.node.firstToken);
      -      end = (this.node.lastToken == null)
      -                ? begin
      -                : document.getElementById(this.node.lastToken).nextSibling;
      -    }
      -    var parent = begin.parentNode;
      -    splice(begin, end, elt);
      -    parent.insertBefore(elt, end);
      -    return elt;
      -  }
      -
      -  // Returns a (detached) 
    • suitable for use in #tree. - buildTree() { - var elt = document.createElement('li'); - elt.dataset['index'] = this.index; - elt.classList.add('tree-node'); - elt.classList.add('selectable-node'); - elt.classList.add(this.node.kind); - var header = document.createElement('header'); - elt.appendChild(header); - - if (this.abbrev.length > 0) { - var abbrev = document.createElement('span'); - abbrev.classList.add('abbrev'); - abbrev.innerText = forest[this.abbrev[0]].symbol; - header.appendChild(abbrev); - } - var name = document.createElement('span'); - name.classList.add('name'); - name.innerText = this.node.symbol; - header.appendChild(name); - - if (this.children.length != 0) { - var sublist = document.createElement('ul'); - this.children.forEach((c) => sublist.appendChild(c.tree)); - elt.appendChild(sublist); - } - return elt; - } - - // Make this view visible on the screen by scrolling if needed. - scrollVisible() { - scrollIntoViewV(document.getElementById('tree'), this.tree.firstChild); - scrollIntoViewV(document.getElementById('code'), this.span); - } - - // Fill #info with details of this node. - renderInfo() { - document.getElementById('info').classList = this.node.kind; - document.getElementById('i_symbol').innerText = this.node.symbol; - document.getElementById('i_kind').innerText = this.node.kind; - - // For sequence nodes, add LHS := RHS rule. - // If this node abbreviates trivial sequences, we want those rules too. - var rules = document.getElementById('i_rules'); - rules.textContent = ''; - function addRule(i) { - var ruleText = forest[i].rule; - if (ruleText == null) - return; - var rule = document.createElement('div'); - rule.classList.add('rule'); - rule.innerText = ruleText; - rules.insertBefore(rule, rules.firstChild); - } - this.abbrev.forEach(addRule); - addRule(this.index); - - // For ambiguous nodes, show a selectable list of alternatives. - var alternatives = document.getElementById('i_alternatives'); - alternatives.textContent = ''; - var that = this; - function addAlternative(i) { - var altNode = forest[i]; - var text = altNode.rule || altNode.kind; - var alt = document.createElement('div'); - alt.classList.add('alternative'); - alt.innerText = text; - alt.dataset['index'] = i; - alt.dataset['parent'] = that.index; - if (i == that.node.selected) - alt.classList.add('selected'); - alternatives.appendChild(alt); - } - if (this.node.kind == 'ambiguous') - this.node.children.forEach(addAlternative); - - // Show the stack of ancestor nodes. - // The part of each rule that leads to the current node is bolded. - var ancestors = document.getElementById('i_ancestors'); - ancestors.textContent = ''; - var child = this; - for (var view = this.parent; view != null; - child = view, view = view.parent) { - var indexInParent = view.children.indexOf(child); - - var ctx = document.createElement('div'); - ctx.classList.add('ancestors'); - ctx.classList.add('selectable-node'); - ctx.classList.add(view.node.kind); - if (view.node.rule) { - // Rule syntax is LHS := RHS1 [annotation] RHS2. - // We walk through the chunks and bold the one at parentInIndex. - var chunkCount = 0; - ctx.innerHTML = view.node.rule.replaceAll(/[^ ]+/g, function(match) { - if (!(match.startsWith('[') && match.endsWith(']')) /*annotations*/ - && chunkCount++ == indexInParent + 2 /*skip LHS :=*/) - return '' + match + ''; - return match; - }); - } else /*ambiguous*/ { - ctx.innerHTML = '' + view.node.symbol + ''; - } - ctx.dataset['index'] = view.index; - if (view.abbrev.length > 0) { - var abbrev = document.createElement('span'); - abbrev.classList.add('abbrev'); - abbrev.innerText = forest[view.abbrev[0]].symbol; - ctx.insertBefore(abbrev, ctx.firstChild); - } - - ctx.dataset['index'] = view.index; - ancestors.appendChild(ctx, ancestors.firstChild); - } - } - - remove() { - this.children.forEach((c) => c.remove()); - splice(this.span.firstChild, null, this.span.parentNode, - this.span.nextSibling); - detach(this.span); - delete views[this.index]; - } -}; - -var selection = null; -function selectView(view) { - var old = selection; - selection = view; - if (view == old) - return; - - if (old) { - old.tree.classList.remove('selected'); - old.span.classList.remove('selected'); - } - document.getElementById('info').hidden = (view == null); - if (!view) - return; - view.tree.classList.add('selected'); - view.span.classList.add('selected'); - view.renderInfo(); - view.scrollVisible(); -} - -// To highlight nodes on hover, we create dynamic CSS rules of the form -// .selectable-node[data-index="42"] { background-color: blue; } -// This avoids needing to find all the related nodes and update their classes. -var highlightSheet = new CSSStyleSheet(); -document.adoptedStyleSheets.push(highlightSheet); -function highlightView(view) { - var text = ''; - for (const color of ['#6af', '#bbb', '#ddd', '#eee']) { - if (view == null) - break; - text += '.selectable-node[data-index="' + view.index + '"] ' - text += '{ background-color: ' + color + '; }\n'; - view = view.parent; - } - highlightSheet.replace(text); -} - -// Select which branch of an ambiguous node is taken. -function chooseAlternative(parent, index) { - var parentView = views[parent]; - parentView.node.selected = index; - var oldChild = parentView.children[0]; - oldChild.remove(); - var newChild = NodeView.make(index, parentView); - parentView.children[0] = newChild; - parentView.tree.lastChild.replaceChild(newChild.tree, oldChild.tree); - - highlightView(null); - // Force redraw of the info box. - selectView(null); - selectView(parentView); -} - -// Attach event listeners and build content once the document is ready. -document.addEventListener("DOMContentLoaded", function() { - var code = document.getElementById('code'); - var tree = document.getElementById('tree'); - var ancestors = document.getElementById('i_ancestors'); - var alternatives = document.getElementById('i_alternatives'); - - [code, tree, ancestors].forEach(function(container) { - container.addEventListener('click', function(e) { - var nodeElt = e.target.closest('.selectable-node'); - selectView(nodeElt && views[Number(nodeElt.dataset['index'])]); - }); - container.addEventListener('mousemove', function(e) { - var nodeElt = e.target.closest('.selectable-node'); - highlightView(nodeElt && views[Number(nodeElt.dataset['index'])]); - }); - }); - - alternatives.addEventListener('click', function(e) { - var altElt = e.target.closest('.alternative'); - if (altElt) - chooseAlternative(Number(altElt.dataset['parent']), - Number(altElt.dataset['index'])); - }); - - // The HTML provides #code content in a hidden DOM element, move it. - var hiddenCode = document.getElementById('hidden-code'); - splice(hiddenCode.firstChild, hiddenCode.lastChild, code); - detach(hiddenCode); - - // Build the tree of NodeViews and attach to #tree. - tree.firstChild.appendChild(NodeView.make(0).tree); -}); - -// Helper DOM functions // - -// Moves the sibling range [first, until) into newParent. -function splice(first, until, newParent, before) { - for (var next = first; next != until;) { - var elt = next; - next = next.nextSibling; - newParent.insertBefore(elt, before); - } -} -function detach(node) { node.parentNode.removeChild(node); } -// Like scrollIntoView, but vertical only! -function scrollIntoViewV(container, elt) { - if (container.scrollTop > elt.offsetTop + elt.offsetHeight || - container.scrollTop + container.clientHeight < elt.offsetTop) - container.scrollTo({top : elt.offsetTop, behavior : 'smooth'}); -} diff --git a/clang-tools-extra/pseudo/unittests/CMakeLists.txt b/clang-tools-extra/pseudo/unittests/CMakeLists.txt index 821ca4d0652e1..33db4fcd8e2ef 100644 --- a/clang-tools-extra/pseudo/unittests/CMakeLists.txt +++ b/clang-tools-extra/pseudo/unittests/CMakeLists.txt @@ -5,13 +5,7 @@ set(LLVM_LINK_COMPONENTS add_custom_target(ClangPseudoUnitTests) add_unittest(ClangPseudoUnitTests ClangPseudoTests BracketTest.cpp - CXXTest.cpp DirectiveTreeTest.cpp - DisambiguateTest.cpp - ForestTest.cpp - GLRTest.cpp - GrammarTest.cpp - LRTableTest.cpp TokenTest.cpp ) @@ -24,8 +18,6 @@ clang_target_link_libraries(ClangPseudoTests target_link_libraries(ClangPseudoTests PRIVATE clangPseudo - clangPseudoCXX - clangPseudoGrammar LLVMTestingAnnotations LLVMTestingSupport ) diff --git a/clang-tools-extra/pseudo/unittests/CXXTest.cpp b/clang-tools-extra/pseudo/unittests/CXXTest.cpp deleted file mode 100644 index 505f958ae7556..0000000000000 --- a/clang-tools-extra/pseudo/unittests/CXXTest.cpp +++ /dev/null @@ -1,30 +0,0 @@ -//===--- CXXTest.cpp ------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "clang-pseudo/cxx/CXX.h" -#include "gtest/gtest.h" - -namespace clang { -namespace pseudo { -namespace cxx { -namespace { - -TEST(CXX, GeneratedEnums) { - const auto &Lang = clang::pseudo::cxx::getLanguage(); - EXPECT_EQ("iteration-statement", - Lang.G.symbolName(Symbol::iteration_statement)); - EXPECT_EQ("iteration-statement := DO statement WHILE ( expression ) ;", - Lang.G.dumpRule( - rule::iteration_statement:: - DO__statement__WHILE__L_PAREN__expression__R_PAREN__SEMI)); -} - -} // namespace -} // namespace cxx -} // namespace pseudo -} // namespace clang diff --git a/clang-tools-extra/pseudo/unittests/DisambiguateTest.cpp b/clang-tools-extra/pseudo/unittests/DisambiguateTest.cpp deleted file mode 100644 index 2f483bb090660..0000000000000 --- a/clang-tools-extra/pseudo/unittests/DisambiguateTest.cpp +++ /dev/null @@ -1,111 +0,0 @@ -//===--- DisambiguateTest.cpp ---------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "clang-pseudo/Disambiguate.h" -#include "clang-pseudo/Forest.h" -#include "clang-pseudo/Token.h" -#include "clang/Basic/TokenKinds.h" -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include - -namespace clang { -namespace pseudo { -namespace { -using testing::ElementsAre; -using testing::Pair; -using testing::UnorderedElementsAre; - -// Common disambiguation test fixture. -// This is the ambiguous forest representing parses of 'a * b;'. -class DisambiguateTest : public ::testing::Test { -protected: - // Greatly simplified C++ grammar. - enum Symbol : SymbolID { - Statement, - Declarator, - Expression, - DeclSpecifier, - Type, - Template, - }; - enum Rule : RuleID { - /* LHS__RHS1_RHS2 means LHS := RHS1 RHS2 */ - Statement__DeclSpecifier_Declarator_Semi, - Declarator__Star_Declarator, - Declarator__Identifier, - Statement__Expression_Semi, - Expression__Expression_Star_Expression, - Expression__Identifier, - DeclSpecifier__Type, - DeclSpecifier__Template, - Type__Identifier, - Template__Identifier, - }; - - ForestArena Arena; - ForestNode &A = Arena.createTerminal(tok::identifier, 0); - ForestNode &Star = Arena.createTerminal(tok::star, 1); - ForestNode &B = Arena.createTerminal(tok::identifier, 2); - ForestNode &Semi = Arena.createTerminal(tok::semi, 3); - - // Parse as multiplication expression. - ForestNode &AExpr = - Arena.createSequence(Expression, Expression__Identifier, &A); - ForestNode &BExpr = - Arena.createSequence(Expression, Expression__Identifier, &B); - ForestNode &Expr = - Arena.createSequence(Expression, Expression__Expression_Star_Expression, - {&AExpr, &Star, &BExpr}); - ForestNode &ExprStmt = Arena.createSequence( - Statement, Statement__Expression_Semi, {&Expr, &Semi}); - // Parse as declaration (`a` may be CTAD or not). - ForestNode &AType = - Arena.createSequence(DeclSpecifier, DeclSpecifier__Type, - &Arena.createSequence(Type, Type__Identifier, &A)); - ForestNode &ATemplate = Arena.createSequence( - DeclSpecifier, DeclSpecifier__Template, - &Arena.createSequence(Template, Template__Identifier, &A)); - ForestNode &DeclSpec = - Arena.createAmbiguous(DeclSpecifier, {&AType, &ATemplate}); - ForestNode &BDeclarator = - Arena.createSequence(Declarator, Declarator__Identifier, &B); - ForestNode &BPtr = Arena.createSequence( - Declarator, Declarator__Star_Declarator, {&Star, &BDeclarator}); - ForestNode &DeclStmt = - Arena.createSequence(Statement, Statement__DeclSpecifier_Declarator_Semi, - {&DeclSpec, &Star, &BDeclarator}); - // Top-level ambiguity - ForestNode &Stmt = Arena.createAmbiguous(Statement, {&ExprStmt, &DeclStmt}); -}; - -TEST_F(DisambiguateTest, Remove) { - Disambiguation D; - D.try_emplace(&Stmt, 1); // statement is a declaration, not an expression - D.try_emplace(&DeclSpec, 0); // a is a type, not a (CTAD) template - ForestNode *Root = &Stmt; - removeAmbiguities(Root, D); - - EXPECT_EQ(Root, &DeclStmt); - EXPECT_THAT(DeclStmt.elements(), ElementsAre(&AType, &Star, &BDeclarator)); -} - -TEST_F(DisambiguateTest, DummyStrategy) { - Disambiguation D = disambiguate(&Stmt, {}); - EXPECT_THAT(D, UnorderedElementsAre(Pair(&Stmt, 1), Pair(&DeclSpec, 1))); - - ForestNode *Root = &Stmt; - removeAmbiguities(Root, D); - EXPECT_EQ(Root, &DeclStmt); - EXPECT_THAT(DeclStmt.elements(), - ElementsAre(&ATemplate, &Star, &BDeclarator)); -} - -} // namespace -} // namespace pseudo -} // namespace clang diff --git a/clang-tools-extra/pseudo/unittests/ForestTest.cpp b/clang-tools-extra/pseudo/unittests/ForestTest.cpp deleted file mode 100644 index 36af896148209..0000000000000 --- a/clang-tools-extra/pseudo/unittests/ForestTest.cpp +++ /dev/null @@ -1,180 +0,0 @@ -//===--- ForestTest.cpp - Test Forest dump ----------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "clang-pseudo/Forest.h" -#include "clang-pseudo/Token.h" -#include "clang/Basic/LangOptions.h" -#include "llvm/ADT/StringRef.h" -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include - -namespace clang { -namespace pseudo { -namespace { - -// FIXME: extract to a TestGrammar class to allow code sharing among tests. -class ForestTest : public ::testing::Test { -public: - void build(llvm::StringRef BNF) { - Diags.clear(); - G = Grammar::parseBNF(BNF, Diags); - } - - SymbolID symbol(llvm::StringRef Name) const { - for (unsigned I = 0; I < NumTerminals; ++I) - if (G.table().Terminals[I] == Name) - return tokenSymbol(static_cast(I)); - for (SymbolID ID = 0; ID < G.table().Nonterminals.size(); ++ID) - if (G.table().Nonterminals[ID].Name == Name) - return ID; - ADD_FAILURE() << "No such symbol found: " << Name; - return 0; - } - - RuleID ruleFor(llvm::StringRef NonterminalName) const { - auto RuleRange = G.table().Nonterminals[symbol(NonterminalName)].RuleRange; - if (RuleRange.End - RuleRange.Start == 1) - return G.table().Nonterminals[symbol(NonterminalName)].RuleRange.Start; - ADD_FAILURE() << "Expected a single rule for " << NonterminalName - << ", but it has " << RuleRange.End - RuleRange.Start - << " rule!\n"; - return 0; - } - -protected: - Grammar G; - std::vector Diags; -}; - -TEST_F(ForestTest, DumpBasic) { - build(R"cpp( - _ := add-expression EOF - add-expression := id-expression + id-expression - id-expression := IDENTIFIER - )cpp"); - ASSERT_TRUE(Diags.empty()); - ForestArena Arena; - const auto &TS = - cook(lex("a + b", clang::LangOptions()), clang::LangOptions()); - - auto T = Arena.createTerminals(TS); - ASSERT_EQ(T.size(), 4u); - const auto *Left = &Arena.createSequence( - symbol("id-expression"), ruleFor("id-expression"), {&T.front()}); - const auto *Right = &Arena.createSequence(symbol("id-expression"), - ruleFor("id-expression"), {&T[2]}); - - const auto *Add = - &Arena.createSequence(symbol("add-expression"), ruleFor("add-expression"), - {Left, &T[1], Right}); - EXPECT_EQ(Add->dumpRecursive(G, true), - "[ 0, end) add-expression := id-expression + id-expression\n" - "[ 0, 1) ├─id-expression~IDENTIFIER := tok[0]\n" - "[ 1, 2) ├─+ := tok[1]\n" - "[ 2, end) └─id-expression~IDENTIFIER := tok[2]\n"); - EXPECT_EQ(Add->dumpRecursive(G, false), - "[ 0, end) add-expression := id-expression + id-expression\n" - "[ 0, 1) ├─id-expression := IDENTIFIER\n" - "[ 0, 1) │ └─IDENTIFIER := tok[0]\n" - "[ 1, 2) ├─+ := tok[1]\n" - "[ 2, end) └─id-expression := IDENTIFIER\n" - "[ 2, end) └─IDENTIFIER := tok[2]\n"); -} - -TEST_F(ForestTest, DumpAmbiguousAndRefs) { - build(R"cpp( - _ := type EOF - type := class-type # rule 4 - type := enum-type # rule 5 - class-type := shared-type - enum-type := shared-type - shared-type := IDENTIFIER)cpp"); - ASSERT_TRUE(Diags.empty()); - ForestArena Arena; - const auto &TS = cook(lex("abc", clang::LangOptions()), clang::LangOptions()); - - auto Terminals = Arena.createTerminals(TS); - ASSERT_EQ(Terminals.size(), 2u); - - const auto *SharedType = &Arena.createSequence( - symbol("shared-type"), ruleFor("shared-type"), {Terminals.begin()}); - const auto *ClassType = &Arena.createSequence( - symbol("class-type"), ruleFor("class-type"), {SharedType}); - const auto *EnumType = &Arena.createSequence( - symbol("enum-type"), ruleFor("enum-type"), {SharedType}); - const auto *Alternative1 = - &Arena.createSequence(symbol("type"), /*RuleID=*/4, {ClassType}); - const auto *Alternative2 = - &Arena.createSequence(symbol("type"), /*RuleID=*/5, {EnumType}); - const auto *Type = - &Arena.createAmbiguous(symbol("type"), {Alternative1, Alternative2}); - EXPECT_EQ(Type->dumpRecursive(G), - "[ 0, end) type := \n" - "[ 0, end) ├─type := class-type\n" - "[ 0, end) │ └─class-type := shared-type\n" - "[ 0, end) │ └─shared-type := IDENTIFIER #1\n" - "[ 0, end) │ └─IDENTIFIER := tok[0]\n" - "[ 0, end) └─type := enum-type\n" - "[ 0, end) └─enum-type := shared-type\n" - "[ 0, end) └─shared-type =#1\n"); -} - -TEST_F(ForestTest, DumpAbbreviatedShared) { - build(R"cpp( - _ := A - A := B - B := * - )cpp"); - - ForestArena Arena; - const auto *Star = &Arena.createTerminal(tok::star, 0); - - const auto *B = &Arena.createSequence(symbol("B"), ruleFor("B"), {Star}); - // We have two identical (but distinct) A nodes. - // The GLR parser would never produce this, but it makes the example simpler. - const auto *A1 = &Arena.createSequence(symbol("A"), ruleFor("A"), {B}); - const auto *A2 = &Arena.createSequence(symbol("A"), ruleFor("A"), {B}); - const auto *A = &Arena.createAmbiguous(symbol("A"), {A1, A2}); - - // We must not abbreviate away shared nodes: if we show A~* there's no way to - // show that the intermediate B node is shared between A1 and A2. - EXPECT_EQ(A->dumpRecursive(G, /*Abbreviate=*/true), - "[ 0, end) A := \n" - "[ 0, end) ├─A~B := * #1\n" - "[ 0, end) │ └─* := tok[0]\n" - "[ 0, end) └─A~B =#1\n"); -} - -TEST_F(ForestTest, Iteration) { - // Z - // / \ - // X Y - // |\| - // A B - ForestArena Arena; - const auto *A = &Arena.createTerminal(tok::identifier, 0); - const auto *B = &Arena.createOpaque(1, 0); - const auto *X = &Arena.createSequence(2, 1, {A, B}); - const auto *Y = &Arena.createSequence(2, 2, {B}); - const auto *Z = &Arena.createAmbiguous(2, {X, Y}); - - std::vector Nodes; - for (const ForestNode &N : Z->descendants()) - Nodes.push_back(&N); - EXPECT_THAT(Nodes, testing::UnorderedElementsAre(A, B, X, Y, Z)); - - Nodes.clear(); - for (const ForestNode &N : X->descendants()) - Nodes.push_back(&N); - EXPECT_THAT(Nodes, testing::UnorderedElementsAre(X, A, B)); -} - -} // namespace -} // namespace pseudo -} // namespace clang diff --git a/clang-tools-extra/pseudo/unittests/GLRTest.cpp b/clang-tools-extra/pseudo/unittests/GLRTest.cpp deleted file mode 100644 index f361fb78247ac..0000000000000 --- a/clang-tools-extra/pseudo/unittests/GLRTest.cpp +++ /dev/null @@ -1,789 +0,0 @@ -//===--- GLRTest.cpp - Test the GLR parser ----------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "clang-pseudo/GLR.h" -#include "clang-pseudo/Bracket.h" -#include "clang-pseudo/Language.h" -#include "clang-pseudo/Token.h" -#include "clang-pseudo/grammar/Grammar.h" -#include "clang/Basic/LangOptions.h" -#include "clang/Basic/TokenKinds.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Support/FormatVariadic.h" -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include - -namespace clang { -namespace pseudo { - -llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, - const std::vector &Heads) { - for (const auto *Head : Heads) - OS << *Head << "\n"; - return OS; -} - -namespace { - -using StateID = LRTable::StateID; -using testing::AllOf; -using testing::ElementsAre; -using testing::IsEmpty; -using testing::UnorderedElementsAre; - -MATCHER_P(state, StateID, "") { return arg->State == StateID; } -MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; } -MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; } -MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; } - -testing::Matcher -parents(llvm::ArrayRef Parents) { - return testing::Property(&GSS::Node::parents, - testing::UnorderedElementsAreArray(Parents)); -} - -Token::Index recoverBraces(Token::Index Begin, const TokenStream &Code) { - EXPECT_GT(Begin, 0u); - const Token &Left = Code.tokens()[Begin - 1]; - EXPECT_EQ(Left.Kind, tok::l_brace); - if (const auto* Right = Left.pair()) { - EXPECT_EQ(Right->Kind, tok::r_brace); - return Code.index(*Right); - } - return Token::Invalid; -} - -class GLRTest : public ::testing::Test { -public: - void build(llvm::StringRef GrammarBNF) { - std::vector Diags; - TestLang.G = Grammar::parseBNF(GrammarBNF, Diags); - } - - TokenStream emptyTokenStream() { - TokenStream Empty; - Empty.finalize(); - return Empty; - } - - void buildGrammar(std::vector Nonterminals, - std::vector Rules) { - Nonterminals.push_back("_"); - llvm::sort(Nonterminals); - Nonterminals.erase(std::unique(Nonterminals.begin(), Nonterminals.end()), - Nonterminals.end()); - std::string FakeTestBNF; - for (const auto &NT : Nonterminals) - FakeTestBNF += llvm::formatv("{0} := {1}\n", "_", NT); - FakeTestBNF += llvm::join(Rules, "\n"); - build(FakeTestBNF); - } - - SymbolID id(llvm::StringRef Name) const { - for (unsigned I = 0; I < NumTerminals; ++I) - if (TestLang.G.table().Terminals[I] == Name) - return tokenSymbol(static_cast(I)); - for (SymbolID ID = 0; ID < TestLang.G.table().Nonterminals.size(); ++ID) - if (TestLang.G.table().Nonterminals[ID].Name == Name) - return ID; - ADD_FAILURE() << "No such symbol found: " << Name; - return 0; - } - ExtensionID extensionID(llvm::StringRef AttrValueName) const { - for (ExtensionID EID = 0; EID < TestLang.G.table().AttributeValues.size(); - ++EID) - if (TestLang.G.table().AttributeValues[EID] == AttrValueName) - return EID; - ADD_FAILURE() << "No such attribute value found: " << AttrValueName; - return 0; - } - - RuleID ruleFor(llvm::StringRef NonterminalName) const { - auto RuleRange = - TestLang.G.table().Nonterminals[id(NonterminalName)].RuleRange; - if (RuleRange.End - RuleRange.Start == 1) - return TestLang.G.table() - .Nonterminals[id(NonterminalName)] - .RuleRange.Start; - ADD_FAILURE() << "Expected a single rule for " << NonterminalName - << ", but it has " << RuleRange.End - RuleRange.Start - << " rule!\n"; - return 0; - } - -protected: - Language TestLang; - ForestArena Arena; - GSS GSStack; -}; - -TEST_F(GLRTest, ShiftMergingHeads) { - // Given a test case where we have two heads 1, 2, 3 in the GSS, the heads 1, - // 2 have shift actions to reach state 4, and the head 3 has a shift action to - // reach state 5: - // 0--1 - // └--2 - // └--3 - // After the shift action, the GSS (with new heads 4, 5) is: - // 0---1---4 - // └---2---┘ - // └---3---5 - auto *GSSNode0 = - GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{}); - auto *GSSNode1 = GSStack.addNode(/*State=*/1, /*ForestNode=*/nullptr, - /*Parents=*/{GSSNode0}); - auto *GSSNode2 = GSStack.addNode(/*State=*/2, /*ForestNode=*/nullptr, - /*Parents=*/{GSSNode0}); - auto *GSSNode3 = GSStack.addNode(/*State=*/3, /*ForestNode=*/nullptr, - /*Parents=*/{GSSNode0}); - - buildGrammar({}, {}); // Create a fake empty grammar. - LRTable::Builder B(TestLang.G); - B.Transition[{StateID{1}, tokenSymbol(tok::semi)}] = StateID{4}; - B.Transition[{StateID{2}, tokenSymbol(tok::semi)}] = StateID{4}; - B.Transition[{StateID{3}, tokenSymbol(tok::semi)}] = StateID{5}; - TestLang.Table = std::move(B).build(); - - ForestNode &SemiTerminal = Arena.createTerminal(tok::semi, 0); - std::vector NewHeads; - glrShift({GSSNode1, GSSNode2, GSSNode3}, SemiTerminal, - {emptyTokenStream(), Arena, GSStack}, TestLang, NewHeads); - - EXPECT_THAT(NewHeads, - UnorderedElementsAre(AllOf(state(4), parsedSymbol(&SemiTerminal), - parents({GSSNode1, GSSNode2})), - AllOf(state(5), parsedSymbol(&SemiTerminal), - parents({GSSNode3})))) - << NewHeads; -} - -TEST_F(GLRTest, ReduceConflictsSplitting) { - // Before (splitting due to R/R conflict): - // 0--1(IDENTIFIER) - // After reducing 1 by `class-name := IDENTIFIER` and - // `enum-name := IDENTIFIER`: - // 0--2(class-name) // 2 is goto(0, class-name) - // └--3(enum-name) // 3 is goto(0, enum-name) - buildGrammar({"class-name", "enum-name"}, - {"class-name := IDENTIFIER", "enum-name := IDENTIFIER"}); - LRTable::Builder B(TestLang.G); - B.Transition[{StateID{0}, id("class-name")}] = StateID{2}; - B.Transition[{StateID{0}, id("enum-name")}] = StateID{3}; - B.Reduce[StateID{1}].insert(ruleFor("class-name")); - B.Reduce[StateID{1}].insert(ruleFor("enum-name")); - TestLang.Table = std::move(B).build(); - - const auto *GSSNode0 = - GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{}); - const auto *GSSNode1 = - GSStack.addNode(1, &Arena.createTerminal(tok::identifier, 0), {GSSNode0}); - - std::vector Heads = {GSSNode1}; - glrReduce(Heads, tokenSymbol(tok::eof), - {emptyTokenStream(), Arena, GSStack}, TestLang); - EXPECT_THAT(Heads, UnorderedElementsAre( - GSSNode1, - AllOf(state(2), parsedSymbolID(id("class-name")), - parents({GSSNode0})), - AllOf(state(3), parsedSymbolID(id("enum-name")), - parents({GSSNode0})))) - << Heads; -} - -TEST_F(GLRTest, ReduceSplittingDueToMultipleBases) { - // Before (splitting due to multiple bases): - // 2(class-name)--4(*) - // 3(enum-name)---┘ - // After reducing 4 by `ptr-operator := *`: - // 2(class-name)--5(ptr-operator) // 5 is goto(2, ptr-operator) - // 3(enum-name)---6(ptr-operator) // 6 is goto(3, ptr-operator) - buildGrammar({"ptr-operator", "class-name", "enum-name"}, - {"ptr-operator := *"}); - - auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/0); - auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/0); - - const auto *GSSNode2 = - GSStack.addNode(/*State=*/2, /*ForestNode=*/ClassNameNode, /*Parents=*/{}); - const auto *GSSNode3 = - GSStack.addNode(/*State=*/3, /*ForestNode=*/EnumNameNode, /*Parents=*/{}); - const auto *GSSNode4 = GSStack.addNode( - /*State=*/4, &Arena.createTerminal(tok::star, /*TokenIndex=*/1), - /*Parents=*/{GSSNode2, GSSNode3}); - - LRTable::Builder B(TestLang.G); - B.Transition[{StateID{2}, id("ptr-operator")}] = StateID{5}; - B.Transition[{StateID{3}, id("ptr-operator")}] = StateID{6}; - B.Reduce[StateID{4}].insert(ruleFor("ptr-operator")); - TestLang.Table = std::move(B).build(); - - std::vector Heads = {GSSNode4}; - glrReduce(Heads, tokenSymbol(tok::eof), {emptyTokenStream(), Arena, GSStack}, - TestLang); - - EXPECT_THAT(Heads, UnorderedElementsAre( - GSSNode4, - AllOf(state(5), parsedSymbolID(id("ptr-operator")), - parents({GSSNode2})), - AllOf(state(6), parsedSymbolID(id("ptr-operator")), - parents({GSSNode3})))) - << Heads; - // Verify that the payload of the two new heads is shared, only a single - // ptr-operator node is created in the forest. - EXPECT_EQ(Heads[1]->Payload, Heads[2]->Payload); -} - -TEST_F(GLRTest, ReduceJoiningWithMultipleBases) { - // Before (joining due to same goto state, multiple bases): - // 0--1(cv-qualifier)--3(class-name) - // └--2(cv-qualifier)--4(enum-name) - // After reducing 3 by `type-name := class-name` and - // 4 by `type-name := enum-name`: - // 0--1(cv-qualifier)--5(type-name) // 5 is goto(1, type-name) and - // └--2(cv-qualifier)--┘ // goto(2, type-name) - buildGrammar({"type-name", "class-name", "enum-name", "cv-qualifier"}, - {"type-name := class-name", "type-name := enum-name"}); - - auto *CVQualifierNode = - &Arena.createOpaque(id("cv-qualifier"), /*TokenIndex=*/0); - auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/1); - auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/1); - - const auto *GSSNode0 = - GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{}); - const auto *GSSNode1 = GSStack.addNode( - /*State=*/1, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0}); - const auto *GSSNode2 = GSStack.addNode( - /*State=*/2, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0}); - const auto *GSSNode3 = GSStack.addNode( - /*State=*/3, /*ForestNode=*/ClassNameNode, - /*Parents=*/{GSSNode1}); - const auto *GSSNode4 = - GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode, - /*Parents=*/{GSSNode2}); - - // FIXME: figure out a way to get rid of the hard-coded reduce RuleID! - LRTable::Builder B(TestLang.G); - B.Transition[{StateID{1}, id("type-name")}] = StateID{5}; - B.Transition[{StateID{2}, id("type-name")}] = StateID{5}; - B.Reduce[StateID{3}].insert(/* type-name := class-name */ RuleID{0}); - B.Reduce[StateID{4}].insert(/* type-name := enum-name */ RuleID{1}); - TestLang.Table = std::move(B).build(); - - std::vector Heads = {GSSNode3, GSSNode4}; - glrReduce(Heads, tokenSymbol(tok::eof), {emptyTokenStream(), Arena, GSStack}, - TestLang); - - // Verify that the stack heads are joint at state 5 after reduces. - EXPECT_THAT(Heads, UnorderedElementsAre(GSSNode3, GSSNode4, - AllOf(state(5), - parsedSymbolID(id("type-name")), - parents({GSSNode1, GSSNode2})))) - << Heads; - // Verify that we create an ambiguous ForestNode of two parses of `type-name`. - EXPECT_EQ(Heads.back()->Payload->dumpRecursive(TestLang.G), - "[ 1, end) type-name := \n" - "[ 1, end) ├─type-name := class-name\n" - "[ 1, end) │ └─class-name := \n" - "[ 1, end) └─type-name := enum-name\n" - "[ 1, end) └─enum-name := \n"); -} - -TEST_F(GLRTest, ReduceJoiningWithSameBase) { - // Before (joining due to same goto state, the same base): - // 0--1(class-name)--3(*) - // └--2(enum-name)--4(*) - // After reducing 3 by `pointer := class-name *` and - // 2 by `pointer := enum-name *`: - // 0--5(pointer) // 5 is goto(0, pointer) - buildGrammar({"pointer", "class-name", "enum-name"}, - {"pointer := class-name *", "pointer := enum-name *"}); - - auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/0); - auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/0); - auto *StartTerminal = &Arena.createTerminal(tok::star, /*TokenIndex=*/1); - - const auto *GSSNode0 = - GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{}); - const auto *GSSNode1 = - GSStack.addNode(/*State=*/1, /*ForestNode=*/ClassNameNode, - /*Parents=*/{GSSNode0}); - const auto *GSSNode2 = - GSStack.addNode(/*State=*/2, /*ForestNode=*/EnumNameNode, - /*Parents=*/{GSSNode0}); - const auto *GSSNode3 = - GSStack.addNode(/*State=*/3, /*ForestNode=*/StartTerminal, - /*Parents=*/{GSSNode1}); - const auto *GSSNode4 = - GSStack.addNode(/*State=*/4, /*ForestNode=*/StartTerminal, - /*Parents=*/{GSSNode2}); - - // FIXME: figure out a way to get rid of the hard-coded reduce RuleID! - LRTable::Builder B(TestLang.G); - B.Transition[{StateID{0}, id("pointer")}] = StateID{5}; - B.Reduce[StateID{3}].insert(/* pointer := class-name */ RuleID{0}); - B.Reduce[StateID{4}].insert(/* pointer := enum-name */ RuleID{1}); - TestLang.Table = std::move(B).build(); - - std::vector Heads = {GSSNode3, GSSNode4}; - glrReduce(Heads, tokenSymbol(tok::eof), - {emptyTokenStream(), Arena, GSStack}, TestLang); - - EXPECT_THAT( - Heads, UnorderedElementsAre(GSSNode3, GSSNode4, - AllOf(state(5), parsedSymbolID(id("pointer")), - parents({GSSNode0})))) - << Heads; - EXPECT_EQ(Heads.back()->Payload->dumpRecursive(TestLang.G), - "[ 0, end) pointer := \n" - "[ 0, end) ├─pointer := class-name *\n" - "[ 0, 1) │ ├─class-name := \n" - "[ 1, end) │ └─* := tok[1]\n" - "[ 0, end) └─pointer := enum-name *\n" - "[ 0, 1) ├─enum-name := \n" - "[ 1, end) └─* := tok[1]\n"); -} - -TEST_F(GLRTest, ReduceLookahead) { - // A term can be followed by +, but not by -. - buildGrammar({"sum", "term"}, {"expr := term + term", "term := IDENTIFIER"}); - LRTable::Builder B(TestLang.G); - B.Transition[{StateID{0}, id("term")}] = StateID{2}; - B.Reduce[StateID{1}].insert(RuleID{0}); - TestLang.Table = std::move(B).build(); - - auto *Identifier = &Arena.createTerminal(tok::identifier, /*Start=*/0); - - const auto *Root = - GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{}); - const auto *GSSNode1 = - GSStack.addNode(/*State=*/1, /*ForestNode=*/Identifier, {Root}); - - // When the lookahead is +, reduce is performed. - std::vector Heads = {GSSNode1}; - glrReduce(Heads, tokenSymbol(tok::plus), {emptyTokenStream(), Arena, GSStack}, - TestLang); - EXPECT_THAT(Heads, - ElementsAre(GSSNode1, AllOf(state(2), parsedSymbolID(id("term")), - parents(Root)))); - - // When the lookahead is -, reduce is not performed. - Heads = {GSSNode1}; - glrReduce(Heads, tokenSymbol(tok::minus), - {emptyTokenStream(), Arena, GSStack}, TestLang); - EXPECT_THAT(Heads, ElementsAre(GSSNode1)); -} - -TEST_F(GLRTest, Recover) { - // Recovery while parsing "word" inside braces. - // Before: - // 0--1({)--2(?) - // After recovering a `word` at state 1: - // 0--3(word) // 3 is goto(1, word) - buildGrammar({"word", "top"}, {"top := { word [recover=Braces] }"}); - LRTable::Builder B(TestLang.G); - B.Transition[{StateID{1}, id("word")}] = StateID{3}; - B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("word")}}); - TestLang.Table = std::move(B).build(); - TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces); - - auto *LBrace = &Arena.createTerminal(tok::l_brace, 0); - auto *Question1 = &Arena.createTerminal(tok::question, 1); - const auto *Root = GSStack.addNode(0, nullptr, {}); - const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root}); - const auto *AfterQuestion1 = GSStack.addNode(2, Question1, {OpenedBraces}); - - // Need a token stream with paired braces so the strategy works. - clang::LangOptions LOptions; - TokenStream Tokens = cook(lex("{ ? ? ? }", LOptions), LOptions); - pairBrackets(Tokens); - std::vector NewHeads; - - unsigned TokenIndex = 2; - glrRecover({AfterQuestion1}, TokenIndex, {Tokens, Arena, GSStack}, TestLang, - NewHeads); - EXPECT_EQ(TokenIndex, 4u) << "should skip ahead to matching brace"; - EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(3), parsedSymbolID(id("word")), - parents({OpenedBraces}), start(1u)))); - EXPECT_EQ(NewHeads.front()->Payload->kind(), ForestNode::Opaque); - - // Test recovery failure: omit closing brace so strategy fails - TokenStream NoRBrace = cook(lex("{ ? ? ? ?", LOptions), LOptions); - pairBrackets(NoRBrace); - NewHeads.clear(); - TokenIndex = 2; - glrRecover({AfterQuestion1}, TokenIndex, {NoRBrace, Arena, GSStack}, TestLang, - NewHeads); - EXPECT_EQ(TokenIndex, 2u) << "should not advance on failure"; - EXPECT_THAT(NewHeads, IsEmpty()); -} - -TEST_F(GLRTest, RecoverRightmost) { - // In a nested block structure, we recover at the innermost possible block. - // Before: - // 0--1({)--1({)--1({) - // After recovering a `block` at inside the second braces: - // 0--1({)--2(body) // 2 is goto(1, body) - buildGrammar({"body", "top"}, {"top := { body [recover=Braces] }"}); - LRTable::Builder B(TestLang.G); - B.Transition[{StateID{1}, id("body")}] = StateID{2}; - B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("body")}}); - TestLang.Table = std::move(B).build(); - TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces); - - clang::LangOptions LOptions; - // Innermost brace is unmatched, to test fallback to next brace. - TokenStream Tokens = cook(lex("{ { { ? } }", LOptions), LOptions); - Tokens.tokens()[0].Pair = 5; - Tokens.tokens()[1].Pair = 4; - Tokens.tokens()[4].Pair = 1; - Tokens.tokens()[5].Pair = 0; - - auto *Brace1 = &Arena.createTerminal(tok::l_brace, 0); - auto *Brace2 = &Arena.createTerminal(tok::l_brace, 1); - auto *Brace3 = &Arena.createTerminal(tok::l_brace, 2); - const auto *Root = GSStack.addNode(0, nullptr, {}); - const auto *In1 = GSStack.addNode(1, Brace1, {Root}); - const auto *In2 = GSStack.addNode(1, Brace2, {In1}); - const auto *In3 = GSStack.addNode(1, Brace3, {In2}); - - unsigned TokenIndex = 3; - std::vector NewHeads; - glrRecover({In3}, TokenIndex, {Tokens, Arena, GSStack}, TestLang, NewHeads); - EXPECT_EQ(TokenIndex, 5u); - EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(2), parsedSymbolID(id("body")), - parents({In2}), start(2u)))); -} - -TEST_F(GLRTest, RecoverAlternatives) { - // Recovery inside braces with multiple equally good options - // Before: - // 0--1({) - // After recovering either `word` or `number` inside the braces: - // 0--1({)--2(word) // 2 is goto(1, word) - // └--3(number) // 3 is goto(1, number) - buildGrammar({"number", "word", "top"}, - { - "top := { number [recover=Braces] }", - "top := { word [recover=Braces] }", - }); - LRTable::Builder B(TestLang.G); - B.Transition[{StateID{1}, id("number")}] = StateID{2}; - B.Transition[{StateID{1}, id("word")}] = StateID{3}; - B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("number")}}); - B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("word")}}); - TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces); - TestLang.Table = std::move(B).build(); - auto *LBrace = &Arena.createTerminal(tok::l_brace, 0); - const auto *Root = GSStack.addNode(0, nullptr, {}); - const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root}); - - clang::LangOptions LOptions; - TokenStream Tokens = cook(lex("{ ? }", LOptions), LOptions); - pairBrackets(Tokens); - std::vector NewHeads; - unsigned TokenIndex = 1; - - glrRecover({OpenedBraces}, TokenIndex, {Tokens, Arena, GSStack}, TestLang, - NewHeads); - EXPECT_EQ(TokenIndex, 2u); - EXPECT_THAT(NewHeads, - UnorderedElementsAre(AllOf(state(2), parsedSymbolID(id("number")), - parents({OpenedBraces}), start(1u)), - AllOf(state(3), parsedSymbolID(id("word")), - parents({OpenedBraces}), start(1u)))); -} - -TEST_F(GLRTest, PerfectForestNodeSharing) { - // Run the GLR on a simple grammar and test that we build exactly one forest - // node per (SymbolID, token range). - - // This is a grmammar where the original parsing-stack-based forest node - // sharing approach will fail. In its LR0 graph, it has two states containing - // item `expr := • IDENTIFIER`, and both have different goto states on the - // nonterminal `expr`. - build(R"bnf( - _ := test EOF - - test := { expr - test := { IDENTIFIER - test := left-paren expr - left-paren := { - expr := IDENTIFIER - )bnf"); - TestLang.Table = LRTable::buildSLR(TestLang.G); - clang::LangOptions LOptions; - const TokenStream &Tokens = cook(lex("{ abc", LOptions), LOptions); - - const ForestNode &Parsed = - glrParse({Tokens, Arena, GSStack}, id("test"), TestLang); - // Verify that there is no duplicated sequence node of `expr := IDENTIFIER` - // in the forest, see the `#1` and `=#1` in the dump string. - EXPECT_EQ(Parsed.dumpRecursive(TestLang.G), - "[ 0, end) test := \n" - "[ 0, end) ├─test := { expr\n" - "[ 0, 1) │ ├─{ := tok[0]\n" - "[ 1, end) │ └─expr := IDENTIFIER #1\n" - "[ 1, end) │ └─IDENTIFIER := tok[1]\n" - "[ 0, end) ├─test := { IDENTIFIER\n" - "[ 0, 1) │ ├─{ := tok[0]\n" - "[ 1, end) │ └─IDENTIFIER := tok[1]\n" - "[ 0, end) └─test := left-paren expr\n" - "[ 0, 1) ├─left-paren := {\n" - "[ 0, 1) │ └─{ := tok[0]\n" - "[ 1, end) └─expr =#1\n"); -} - -TEST_F(GLRTest, GLRReduceOrder) { - // Given the following grammar, and the input `IDENTIFIER`, reductions should - // be performed in the following order: - // 1. foo := IDENTIFIER - // 2. { test := IDENTIFIER, test := foo } - // foo should be reduced first, so that in step 2 we have completed reduces - // for test, and form an ambiguous forest node. - build(R"bnf( - _ := test EOF - - test := IDENTIFIER - test := foo - foo := IDENTIFIER - )bnf"); - clang::LangOptions LOptions; - const TokenStream &Tokens = cook(lex("IDENTIFIER", LOptions), LOptions); - TestLang.Table = LRTable::buildSLR(TestLang.G); - - const ForestNode &Parsed = - glrParse({Tokens, Arena, GSStack}, id("test"), TestLang); - EXPECT_EQ(Parsed.dumpRecursive(TestLang.G), - "[ 0, end) test := \n" - "[ 0, end) ├─test := IDENTIFIER\n" - "[ 0, end) │ └─IDENTIFIER := tok[0]\n" - "[ 0, end) └─test := foo\n" - "[ 0, end) └─foo := IDENTIFIER\n" - "[ 0, end) └─IDENTIFIER := tok[0]\n"); -} - -TEST_F(GLRTest, RecoveryEndToEnd) { - // Simple example of brace-based recovery showing: - // - recovered region includes tokens both ahead of and behind the cursor - // - multiple possible recovery rules - // - recovery from outer scopes is rejected - build(R"bnf( - _ := block EOF - - block := { block [recover=Braces] } - block := { numbers [recover=Braces] } - numbers := NUMERIC_CONSTANT NUMERIC_CONSTANT - )bnf"); - TestLang.Table = LRTable::buildSLR(TestLang.G); - TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces); - clang::LangOptions LOptions; - TokenStream Tokens = cook(lex("{ { 42 ? } }", LOptions), LOptions); - pairBrackets(Tokens); - - const ForestNode &Parsed = - glrParse({Tokens, Arena, GSStack}, id("block"), TestLang); - EXPECT_EQ(Parsed.dumpRecursive(TestLang.G), - "[ 0, end) block := { block [recover=Braces] }\n" - "[ 0, 1) ├─{ := tok[0]\n" - "[ 1, 5) ├─block := \n" - "[ 1, 5) │ ├─block := { block [recover=Braces] }\n" - "[ 1, 2) │ │ ├─{ := tok[1]\n" - "[ 2, 4) │ │ ├─block := \n" - "[ 4, 5) │ │ └─} := tok[4]\n" - "[ 1, 5) │ └─block := { numbers [recover=Braces] }\n" - "[ 1, 2) │ ├─{ := tok[1]\n" - "[ 2, 4) │ ├─numbers := \n" - "[ 4, 5) │ └─} := tok[4]\n" - "[ 5, end) └─} := tok[5]\n"); -} - -TEST_F(GLRTest, RecoverTerminal) { - build(R"bnf( - _ := stmt EOF - - stmt := IDENTIFIER ; [recover=Skip] - )bnf"); - TestLang.Table = LRTable::buildSLR(TestLang.G); - TestLang.RecoveryStrategies.try_emplace( - extensionID("Skip"), - [](Token::Index Start, const TokenStream &) { return Start; }); - clang::LangOptions LOptions; - TokenStream Tokens = cook(lex("foo", LOptions), LOptions); - - const ForestNode &Parsed = - glrParse({Tokens, Arena, GSStack}, id("stmt"), TestLang); - EXPECT_EQ(Parsed.dumpRecursive(TestLang.G), - "[ 0, end) stmt := IDENTIFIER ; [recover=Skip]\n" - "[ 0, 1) ├─IDENTIFIER := tok[0]\n" - "[ 1, end) └─; := \n"); -} - -TEST_F(GLRTest, RecoverUnrestrictedReduce) { - // Here, ! is not in any rule and therefore not in the follow set of `word`. - // We would not normally reduce `word := IDENTIFIER`, but do so for recovery. - - build(R"bnf( - _ := sentence EOF - - word := IDENTIFIER - sentence := word word [recover=AcceptAnyTokenInstead] - )bnf"); - - clang::LangOptions LOptions; - const TokenStream &Tokens = cook(lex("id !", LOptions), LOptions); - TestLang.Table = LRTable::buildSLR(TestLang.G); - TestLang.RecoveryStrategies.try_emplace( - extensionID("AcceptAnyTokenInstead"), - [](Token::Index Start, const TokenStream &Stream) { return Start + 1; }); - - const ForestNode &Parsed = - glrParse({Tokens, Arena, GSStack}, id("sentence"), TestLang); - EXPECT_EQ(Parsed.dumpRecursive(TestLang.G), - "[ 0, end) sentence := word word [recover=AcceptAnyTokenInstead]\n" - "[ 0, 1) ├─word := IDENTIFIER\n" - "[ 0, 1) │ └─IDENTIFIER := tok[0]\n" - "[ 1, end) └─word := \n"); -} - -TEST_F(GLRTest, RecoveryFromStartOfInput) { - build(R"bnf( - _ := start [recover=Fallback] EOF - - start := IDENTIFIER - )bnf"); - TestLang.Table = LRTable::buildSLR(TestLang.G); - bool fallback_recovered = false; - auto fallback = [&](Token::Index Start, const TokenStream & Code) { - fallback_recovered = true; - return Code.tokens().size(); - }; - TestLang.RecoveryStrategies.try_emplace( - extensionID("Fallback"), - fallback); - clang::LangOptions LOptions; - TokenStream Tokens = cook(lex("?", LOptions), LOptions); - - const ForestNode &Parsed = - glrParse({Tokens, Arena, GSStack}, id("start"), TestLang); - EXPECT_TRUE(fallback_recovered); - EXPECT_EQ(Parsed.dumpRecursive(TestLang.G), - "[ 0, end) start := \n"); -} - -TEST_F(GLRTest, RepeatedRecovery) { - // We require multiple steps of recovery at eof and then a reduction in order - // to successfully parse. - build(R"bnf( - _ := function EOF - # FIXME: this forces EOF to be in follow(signature). - # Remove it once we use unconstrained reduction for recovery. - _ := signature EOF - - function := signature body [recover=Skip] - signature := IDENTIFIER params [recover=Skip] - params := ( ) - body := { } - )bnf"); - TestLang.Table = LRTable::buildSLR(TestLang.G); - TestLang.RecoveryStrategies.try_emplace( - extensionID("Skip"), - [](Token::Index Start, const TokenStream &) { return Start; }); - clang::LangOptions LOptions; - TokenStream Tokens = cook(lex("main", LOptions), LOptions); - - const ForestNode &Parsed = - glrParse({Tokens, Arena, GSStack}, id("function"), TestLang); - EXPECT_EQ(Parsed.dumpRecursive(TestLang.G), - "[ 0, end) function := signature body [recover=Skip]\n" - "[ 0, 1) ├─signature := IDENTIFIER params [recover=Skip]\n" - "[ 0, 1) │ ├─IDENTIFIER := tok[0]\n" - "[ 1, 1) │ └─params := \n" - "[ 1, end) └─body := \n"); -} - -TEST_F(GLRTest, NoExplicitAccept) { - build(R"bnf( - _ := test EOF - - test := IDENTIFIER test - test := IDENTIFIER - )bnf"); - clang::LangOptions LOptions; - // Given the following input, and the grammar above, we perform two reductions - // of the nonterminal `test` when the next token is `eof`, verify that the - // parser stops at the right state. - const TokenStream &Tokens = cook(lex("id id", LOptions), LOptions); - TestLang.Table = LRTable::buildSLR(TestLang.G); - - const ForestNode &Parsed = - glrParse({Tokens, Arena, GSStack}, id("test"), TestLang); - EXPECT_EQ(Parsed.dumpRecursive(TestLang.G), - "[ 0, end) test := IDENTIFIER test\n" - "[ 0, 1) ├─IDENTIFIER := tok[0]\n" - "[ 1, end) └─test := IDENTIFIER\n" - "[ 1, end) └─IDENTIFIER := tok[1]\n"); -} - -TEST_F(GLRTest, GuardExtension) { - build(R"bnf( - _ := start EOF - - start := IDENTIFIER [guard] - )bnf"); - TestLang.Guards.try_emplace( - ruleFor("start"), [&](const GuardParams &P) { - assert(P.RHS.size() == 1 && - P.RHS.front()->symbol() == - tokenSymbol(clang::tok::identifier)); - return P.Tokens.tokens()[P.RHS.front()->startTokenIndex()] - .text() == "test"; - }); - clang::LangOptions LOptions; - TestLang.Table = LRTable::buildSLR(TestLang.G); - - std::string Input = "test"; - const TokenStream &Succeeded = cook(lex(Input, LOptions), LOptions); - EXPECT_EQ(glrParse({Succeeded, Arena, GSStack}, id("start"), TestLang) - .dumpRecursive(TestLang.G), - "[ 0, end) start := IDENTIFIER [guard]\n" - "[ 0, end) └─IDENTIFIER := tok[0]\n"); - - Input = "notest"; - const TokenStream &Failed = cook(lex(Input, LOptions), LOptions); - EXPECT_EQ(glrParse({Failed, Arena, GSStack}, id("start"), TestLang) - .dumpRecursive(TestLang.G), - "[ 0, end) start := \n"); -} - -TEST(GSSTest, GC) { - // ┌-A-┬-AB - // ├-B-┘ - // Root-+-C - // ├-D - // └-E - GSS GSStack; - auto *Root = GSStack.addNode(0, nullptr, {}); - auto *A = GSStack.addNode(0, nullptr, {Root}); - auto *B = GSStack.addNode(0, nullptr, {Root}); - auto *C = GSStack.addNode(0, nullptr, {Root}); - auto *D = GSStack.addNode(0, nullptr, {Root}); - auto *AB = GSStack.addNode(0, nullptr, {A, B}); - - EXPECT_EQ(1u, GSStack.gc({AB, C})) << "D is destroyed"; - EXPECT_EQ(0u, GSStack.gc({AB, C})) << "D is already gone"; - auto *E = GSStack.addNode(0, nullptr, {Root}); - EXPECT_EQ(D, E) << "Storage of GCed node D is reused for E"; - EXPECT_EQ(3u, GSStack.gc({A, E})) << "Destroys B, AB, C"; - EXPECT_EQ(1u, GSStack.gc({E})) << "Destroys A"; -} - -} // namespace -} // namespace pseudo -} // namespace clang diff --git a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp b/clang-tools-extra/pseudo/unittests/GrammarTest.cpp deleted file mode 100644 index 6b6b47b8a2dbe..0000000000000 --- a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp +++ /dev/null @@ -1,213 +0,0 @@ -//===--- GrammarTest.cpp - grammar tests -----------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "clang-pseudo/grammar/Grammar.h" -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include - -namespace clang { -namespace pseudo { -namespace { - -using testing::AllOf; -using testing::ElementsAre; -using testing::IsEmpty; -using testing::Pair; -using testing::UnorderedElementsAre; - -MATCHER_P(TargetID, SID, "") { return arg.Target == SID; } -template testing::Matcher Sequence(T... IDs) { - return testing::Property(&Rule::seq, ElementsAre(IDs...)); -} - -class GrammarTest : public ::testing::Test { -public: - void build(llvm::StringRef BNF) { - Diags.clear(); - G = Grammar::parseBNF(BNF, Diags); - } - - SymbolID id(llvm::StringRef Name) const { - for (unsigned I = 0; I < NumTerminals; ++I) - if (G.table().Terminals[I] == Name) - return tokenSymbol(static_cast(I)); - for (SymbolID ID = 0; ID < G.table().Nonterminals.size(); ++ID) - if (G.table().Nonterminals[ID].Name == Name) - return ID; - ADD_FAILURE() << "No such symbol found: " << Name; - return 0; - } - - RuleID ruleFor(llvm::StringRef NonterminalName) const { - auto RuleRange = G.table().Nonterminals[id(NonterminalName)].RuleRange; - if (RuleRange.End - RuleRange.Start == 1) - return G.table().Nonterminals[id(NonterminalName)].RuleRange.Start; - ADD_FAILURE() << "Expected a single rule for " << NonterminalName - << ", but it has " << RuleRange.End - RuleRange.Start - << " rule!\n"; - return 0; - } - -protected: - Grammar G; - std::vector Diags; -}; - -TEST_F(GrammarTest, Basic) { - build("_ := IDENTIFIER + _ # comment"); - EXPECT_THAT(Diags, IsEmpty()); - - auto ExpectedRule = - AllOf(TargetID(id("_")), Sequence(id("IDENTIFIER"), id("+"), id("_"))); - EXPECT_EQ(G.symbolName(id("_")), "_"); - EXPECT_THAT(G.rulesFor(id("_")), UnorderedElementsAre(ExpectedRule)); - const auto &Rule = G.lookupRule(/*RID=*/0); - EXPECT_THAT(Rule, ExpectedRule); - EXPECT_THAT(G.symbolName(Rule.seq()[0]), "IDENTIFIER"); - EXPECT_THAT(G.symbolName(Rule.seq()[1]), "+"); - EXPECT_THAT(G.symbolName(Rule.seq()[2]), "_"); -} - -TEST_F(GrammarTest, EliminatedOptional) { - build("_ := CONST_opt INT ;_opt"); - EXPECT_THAT(Diags, IsEmpty()); - EXPECT_THAT(G.table().Rules, - UnorderedElementsAre(Sequence(id("INT")), - Sequence(id("CONST"), id("INT")), - Sequence(id("CONST"), id("INT"), id(";")), - Sequence(id("INT"), id(";")))); -} - -TEST_F(GrammarTest, RuleIDSorted) { - build(R"bnf( - _ := x - - x := y - y := z - z := IDENTIFIER - )bnf"); - ASSERT_TRUE(Diags.empty()); - - EXPECT_LT(ruleFor("z"), ruleFor("y")); - EXPECT_LT(ruleFor("y"), ruleFor("x")); - EXPECT_LT(ruleFor("x"), ruleFor("_")); -} - -TEST_F(GrammarTest, Annotation) { - build(R"bnf( - _ := x - x := IDENTIFIER [guard] - )bnf"); - ASSERT_THAT(Diags, IsEmpty()); - EXPECT_FALSE(G.lookupRule(ruleFor("_")).Guarded); - EXPECT_TRUE(G.lookupRule(ruleFor("x")).Guarded); -} - -TEST_F(GrammarTest, Diagnostics) { - build(R"cpp( - _ := ,_opt - _ := undefined-sym - null := - _ := IDENFIFIE # a typo of the terminal IDENFITIER - - invalid - # cycle - a := b - b := a - - _ := IDENTIFIER [unknown=value] - )cpp"); - - EXPECT_EQ(G.underscore(), id("_")); - EXPECT_THAT(Diags, UnorderedElementsAre( - "Rule '_ := ,_opt' has a nullable RHS", - "Rule 'null := ' has a nullable RHS", - "No rules for nonterminal: undefined-sym", - "Failed to parse 'invalid': no separator :=", - "Token-like name IDENFIFIE is used as a nonterminal", - "No rules for nonterminal: IDENFIFIE", - "The grammar contains a cycle involving symbol a", - "Unknown attribute 'unknown'")); -} - -TEST_F(GrammarTest, DuplicatedDiagnostics) { - build(R"cpp( - _ := test - - test := INT - test := DOUBLE - test := INT - )cpp"); - - EXPECT_THAT(Diags, UnorderedElementsAre("Duplicate rule: `test := INT`")); -} - -TEST_F(GrammarTest, FirstAndFollowSets) { - build( - R"bnf( -_ := expr -expr := expr - term -expr := term -term := IDENTIFIER -term := ( expr ) -)bnf"); - ASSERT_TRUE(Diags.empty()); - auto ToPairs = [](std::vector> Input) { - std::vector>> Sets; - for (SymbolID ID = 0; ID < Input.size(); ++ID) - Sets.emplace_back(ID, std::move(Input[ID])); - return Sets; - }; - - EXPECT_THAT( - ToPairs(firstSets(G)), - UnorderedElementsAre( - Pair(id("_"), UnorderedElementsAre(id("IDENTIFIER"), id("("))), - Pair(id("expr"), UnorderedElementsAre(id("IDENTIFIER"), id("("))), - Pair(id("term"), UnorderedElementsAre(id("IDENTIFIER"), id("("))))); - EXPECT_THAT( - ToPairs(followSets(G)), - UnorderedElementsAre( - Pair(id("_"), UnorderedElementsAre(id("EOF"))), - Pair(id("expr"), UnorderedElementsAre(id("-"), id("EOF"), id(")"))), - Pair(id("term"), UnorderedElementsAre(id("-"), id("EOF"), id(")"))))); - - build(R"bnf( -# A simplfied C++ decl-specifier-seq. -_ := decl-specifier-seq -decl-specifier-seq := decl-specifier decl-specifier-seq -decl-specifier-seq := decl-specifier -decl-specifier := simple-type-specifier -decl-specifier := INLINE -simple-type-specifier := INT - )bnf"); - ASSERT_TRUE(Diags.empty()); - EXPECT_THAT( - ToPairs(firstSets(G)), - UnorderedElementsAre( - Pair(id("_"), UnorderedElementsAre(id("INLINE"), id("INT"))), - Pair(id("decl-specifier-seq"), - UnorderedElementsAre(id("INLINE"), id("INT"))), - Pair(id("simple-type-specifier"), UnorderedElementsAre(id("INT"))), - Pair(id("decl-specifier"), - UnorderedElementsAre(id("INLINE"), id("INT"))))); - EXPECT_THAT( - ToPairs(followSets(G)), - UnorderedElementsAre( - Pair(id("_"), UnorderedElementsAre(id("EOF"))), - Pair(id("decl-specifier-seq"), UnorderedElementsAre(id("EOF"))), - Pair(id("decl-specifier"), - UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF"))), - Pair(id("simple-type-specifier"), - UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF"))))); -} - -} // namespace -} // namespace pseudo -} // namespace clang diff --git a/clang-tools-extra/pseudo/unittests/LRTableTest.cpp b/clang-tools-extra/pseudo/unittests/LRTableTest.cpp deleted file mode 100644 index 9c9f18e03a3d4..0000000000000 --- a/clang-tools-extra/pseudo/unittests/LRTableTest.cpp +++ /dev/null @@ -1,76 +0,0 @@ -//===--- LRTableTest.cpp - ---------------------------------------*- C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "clang-pseudo/grammar/LRTable.h" -#include "clang-pseudo/grammar/Grammar.h" -#include "clang/Basic/TokenKinds.h" -#include "llvm/Testing/Support/SupportHelpers.h" -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include - -namespace clang { -namespace pseudo { -namespace { - -using llvm::ValueIs; -using testing::ElementsAre; -using StateID = LRTable::StateID; - -TEST(LRTable, Builder) { - std::vector GrammarDiags; - Grammar G = Grammar::parseBNF(R"bnf( - _ := expr # rule 0 - expr := term # rule 1 - expr := expr + term # rule 2 - term := IDENTIFIER # rule 3 - )bnf", - GrammarDiags); - EXPECT_THAT(GrammarDiags, testing::IsEmpty()); - - SymbolID Term = *G.findNonterminal("term"); - SymbolID Eof = tokenSymbol(tok::eof); - SymbolID Identifier = tokenSymbol(tok::identifier); - SymbolID Plus = tokenSymbol(tok::plus); - - LRTable::Builder B(G); - // eof IDENT term - // +-------+----+-------+------+ - // |state0 | | s0 | | - // |state1 | | | g3 | - // |state2 | | | | - // +-------+----+-------+------+------- - B.Transition[{StateID{0}, Identifier}] = StateID{0}; - B.Transition[{StateID{1}, Term}] = StateID{3}; - B.Reduce[StateID{0}].insert(RuleID{0}); - B.Reduce[StateID{1}].insert(RuleID{2}); - B.Reduce[StateID{2}].insert(RuleID{1}); - LRTable T = std::move(B).build(); - - EXPECT_EQ(T.getShiftState(0, Eof), std::nullopt); - EXPECT_THAT(T.getShiftState(0, Identifier), ValueIs(0)); - EXPECT_THAT(T.getReduceRules(0), ElementsAre(0)); - - EXPECT_EQ(T.getShiftState(1, Eof), std::nullopt); - EXPECT_EQ(T.getShiftState(1, Identifier), std::nullopt); - EXPECT_THAT(T.getGoToState(1, Term), ValueIs(3)); - EXPECT_THAT(T.getReduceRules(1), ElementsAre(2)); - - // Verify the behaivor for other non-available-actions terminals. - SymbolID Int = tokenSymbol(tok::kw_int); - EXPECT_EQ(T.getShiftState(2, Int), std::nullopt); - - // Check follow sets. - EXPECT_TRUE(T.canFollow(Term, Plus)); - EXPECT_TRUE(T.canFollow(Term, Eof)); - EXPECT_FALSE(T.canFollow(Term, Int)); -} - -} // namespace -} // namespace pseudo -} // namespace clang