diff options
Diffstat (limited to 'src/core/parser')
19 files changed, 2776 insertions, 788 deletions
diff --git a/src/core/parser/ParserStack.cpp b/src/core/parser/ParserStack.cpp deleted file mode 100644 index 1265851..0000000 --- a/src/core/parser/ParserStack.cpp +++ /dev/null @@ -1,216 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -#include <sstream> - -#include <core/common/Utils.hpp> -#include <core/common/Exceptions.hpp> -#include <core/model/Project.hpp> - -#include "ParserScope.hpp" -#include "ParserStack.hpp" - -namespace ousia { - -/* A default handler */ - -/** - * The DefaultHandler class is used in case no element handler is specified in - * the ParserState descriptor. - */ -class DefaultHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override {} - - void end() override {} - - static Handler *create(const HandlerData &handlerData) - { - return new DefaultHandler{handlerData}; - } -}; - -/* Class Handler */ - -void Handler::data(const std::string &data, int field) -{ - if (Utils::hasNonWhitepaceChar(data)) { - logger().error("Expected command but found character data."); - } -} - -/* Class ParserStack */ - -/** - * Returns an Exception that should be thrown when a currently invalid command - * is thrown. - */ -static LoggableException InvalidCommand(const std::string &name, - const std::set<std::string> &expected) -{ - if (expected.empty()) { - return LoggableException{ - std::string{"No nested elements allowed, but got \""} + name + - std::string{"\""}}; - } else { - return LoggableException{ - std::string{"Expected "} + - (expected.size() == 1 ? std::string{"\""} - : std::string{"one of \""}) + - Utils::join(expected, "\", \"") + std::string{"\", but got \""} + - name + std::string{"\""}}; - } -} - -ParserStack::ParserStack( - ParserContext &ctx, - const std::multimap<std::string, const ParserState *> &states) - : ctx(ctx), states(states) -{ -} - -bool ParserStack::deduceState() -{ - // Assemble all states - std::vector<const ParserState *> states; - for (const auto &e : this->states) { - states.push_back(e.second); - } - - // Fetch the type signature of the scope and derive all possible states, - // abort if no unique parser state was found - std::vector<const ParserState *> possibleStates = - ParserStateDeductor(ctx.getScope().getStackTypeSignature(), states) - .deduce(); - if (possibleStates.size() != 1) { - ctx.getLogger().error( - "Error while including file: Cannot deduce parser state."); - return false; - } - - // Switch to this state by creating a dummy handler - const ParserState *state = possibleStates[0]; - Handler *handler = - DefaultHandler::create({ctx, "", *state, *state, SourceLocation{}}); - stack.emplace(handler); - return true; -} - -std::set<std::string> ParserStack::expectedCommands() -{ - const ParserState *currentState = &(this->currentState()); - std::set<std::string> res; - for (const auto &v : states) { - if (v.second->parents.count(currentState)) { - res.insert(v.first); - } - } - return res; -} - -const ParserState &ParserStack::currentState() -{ - return stack.empty() ? ParserStates::None : stack.top()->state(); -} - -std::string ParserStack::currentCommandName() -{ - return stack.empty() ? std::string{} : stack.top()->name(); -} - -const ParserState *ParserStack::findTargetState(const std::string &name) -{ - const ParserState *currentState = &(this->currentState()); - auto range = states.equal_range(name); - for (auto it = range.first; it != range.second; it++) { - const ParserStateSet &parents = it->second->parents; - if (parents.count(currentState) || parents.count(&ParserStates::All)) { - return it->second; - } - } - - return nullptr; -} - -void ParserStack::start(const std::string &name, Variant::mapType &args, - const SourceLocation &location) -{ - ParserState const *targetState = findTargetState(name); -// TODO: Andreas, please improve this. -// if (!Utils::isIdentifier(name)) { -// throw LoggableException(std::string("Invalid identifier \"") + name + -// std::string("\"")); -// } - - if (targetState == nullptr) { - targetState = findTargetState("*"); - } - if (targetState == nullptr) { - throw InvalidCommand(name, expectedCommands()); - } - - // Fetch the associated constructor - HandlerConstructor ctor = targetState->elementHandler - ? targetState->elementHandler - : DefaultHandler::create; - - // Canonicalize the arguments, allow additional arguments - targetState->arguments.validateMap(args, ctx.getLogger(), true); - - // Instantiate the handler and call its start function - Handler *handler = ctor({ctx, name, *targetState, currentState(), location}); - handler->start(args); - stack.emplace(handler); -} - -void ParserStack::start(std::string name, const Variant::mapType &args, - const SourceLocation &location) -{ - Variant::mapType argsCopy(args); - start(name, argsCopy); -} - -void ParserStack::end() -{ - // Check whether the current command could be ended - if (stack.empty()) { - throw LoggableException{"No command to end."}; - } - - // Remove the current HandlerInstance from the stack - std::shared_ptr<Handler> inst{stack.top()}; - stack.pop(); - - // Call the end function of the last Handler - inst->end(); -} - -void ParserStack::data(const std::string &data, int field) -{ - // Check whether there is any command the data can be sent to - if (stack.empty()) { - throw LoggableException{"No command to receive data."}; - } - - // Pass the data to the current Handler instance - stack.top()->data(data, field); -} -} - diff --git a/src/core/parser/ParserStack.hpp b/src/core/parser/ParserStack.hpp deleted file mode 100644 index efc4e4a..0000000 --- a/src/core/parser/ParserStack.hpp +++ /dev/null @@ -1,361 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -/** - * @file ParserStack.hpp - * - * Helper classes for document or description parsers. Contains the ParserStack - * class, which is an pushdown automaton responsible for accepting commands in - * the correct order and calling specified handlers. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_PARSER_STACK_HPP_ -#define _OUSIA_PARSER_STACK_HPP_ - -#include <cstdint> - -#include <map> -#include <memory> -#include <set> -#include <stack> -#include <vector> - -#include <core/common/Variant.hpp> -#include <core/common/Logger.hpp> -#include <core/common/Argument.hpp> - -#include "Parser.hpp" -#include "ParserContext.hpp" -#include "ParserState.hpp" - -namespace ousia { - -/** - * Struct collecting all the data that is being passed to a Handler instance. - */ -struct HandlerData { - /** - * Reference to the ParserContext instance that should be used to resolve - * references to nodes in the Graph. - */ - ParserContext &ctx; - - /** - * Contains the name of the tag that is being handled. - */ - const std::string name; - - /** - * Contains the current state of the state machine. - */ - const ParserState &state; - - /** - * Contains the state of the state machine when the parent node was handled. - */ - const ParserState &parentState; - - /** - * Current source code location. - */ - const SourceLocation location; - - /** - * Constructor of the HandlerData class. - * - * @param ctx is the parser context the handler should be executed in. - * @param name is the name of the string. - * @param state is the state this handler was called for. - * @param parentState is the state of the parent command. - * @param location is the location at which the handler is created. - */ - HandlerData(ParserContext &ctx, std::string name, const ParserState &state, - const ParserState &parentState, const SourceLocation location) - : ctx(ctx), - name(std::move(name)), - state(state), - parentState(parentState), - location(location){}; -}; - -/** - * The handler class provides a context for handling an XML tag. It has to be - * overridden and registered in the StateStack class to form handlers for - * concrete XML tags. - */ -class Handler { -private: - /** - * Structure containing the internal handler data. - */ - const HandlerData handlerData; - -public: - /** - * Constructor of the Handler class. - * - * @param data is a structure containing all data being passed to the - * handler. - */ - Handler(const HandlerData &handlerData) : handlerData(handlerData){}; - - /** - * Virtual destructor. - */ - virtual ~Handler(){}; - - /** - * Returns a reference at the ParserContext. - * - * @return a reference at the ParserContext. - */ - ParserContext &context() { return handlerData.ctx; } - - /** - * Returns the command name for which the handler was created. - * - * @return a const reference at the command name. - */ - const std::string &name() { return handlerData.name; } - - /** - * Returns a reference at the ParserScope instance. - * - * @return a reference at the ParserScope instance. - */ - ParserScope &scope() { return handlerData.ctx.getScope(); } - - /** - * Returns a reference at the Manager instance which manages all nodes. - * - * @return a referance at the Manager instance. - */ - Manager &manager() { return handlerData.ctx.getManager(); } - - /** - * Returns a reference at the Logger instance used for logging error - * messages. - * - * @return a reference at the Logger instance. - */ - Logger &logger() { return handlerData.ctx.getLogger(); } - - /** - * Returns a reference at the Project Node, representing the project into - * which the file is currently being parsed. - * - * @return a referance at the Project Node. - */ - Rooted<Project> project() { return handlerData.ctx.getProject(); } - - /** - * Reference at the ParserState descriptor for which this Handler was - * created. - * - * @return a const reference at the constructing ParserState descriptor. - */ - const ParserState &state() { return handlerData.state; } - - /** - * Reference at the ParserState descriptor of the parent state of the state - * for which this Handler was created. Set to ParserStates::None if there - * is no parent state. - * - * @return a const reference at the parent state of the constructing - * ParserState descriptor. - */ - const ParserState &parentState() { return handlerData.parentState; } - - /** - * Returns the current location in the source file. - * - * @return the current location in the source file. - */ - SourceLocation location() { return handlerData.location; } - - /** - * Called when the command that was specified in the constructor is - * instanciated. - * - * @param args is a map from strings to variants (argument name and value). - */ - virtual void start(Variant::mapType &args) = 0; - - /** - * Called whenever the command for which this handler is defined ends. - */ - virtual void end() = 0; - - /** - * Called whenever raw data (int the form of a string) is available for the - * Handler instance. In the default handler an exception is raised if the - * received data contains non-whitespace characters. - * - * @param data is a pointer at the character data that is available for the - * Handler instance. - * @param field is the field number (the interpretation of this value - * depends on the format that is being parsed). - */ - virtual void data(const std::string &data, int field); -}; - -/** - * HandlerConstructor is a function pointer type used to create concrete - * instances of the Handler class. - * - * @param handlerData is the data that should be passed to the new handler - * instance. - * @return a newly created handler instance. - */ -using HandlerConstructor = Handler *(*)(const HandlerData &handlerData); - -/** - * The ParserStack class is a pushdown automaton responsible for turning a - * command stream into a tree of Node instances. - */ -class ParserStack { -private: - /** - * Reference at the parser context. - */ - ParserContext &ctx; - - /** - * Map containing all registered command names and the corresponding - * state descriptors. - */ - const std::multimap<std::string, const ParserState *> &states; - - /** - * Internal stack used for managing the currently active Handler instances. - */ - std::stack<std::shared_ptr<Handler>> stack; - - /** - * Used internally to get all expected command names for the current state. - * This function is used to build error messages. - * - * @return a set of strings containing the names of the expected commands. - */ - std::set<std::string> expectedCommands(); - - /** - * Returns the targetState for a command with the given name that can be - * reached from for the current state. - * - * @param name is the name of the requested command. - * @return nullptr if no target state was found, a pointer at the target - *state - * otherwise. - */ - const ParserState *findTargetState(const std::string &name); - -public: - /** - * Creates a new instance of the ParserStack class. - * - * @param ctx is the parser context the parser stack is working on. - * @param states is a map containing the command names and pointers at the - * corresponding ParserState instances. - */ - ParserStack(ParserContext &ctx, - const std::multimap<std::string, const ParserState *> &states); - - /** - * Tries to reconstruct the parser state from the Scope instance of the - * ParserContext given in the constructor. This functionality is needed for - * including files,as the Parser of the included file needs to be brought to - + an equivalent state as the one in the including file. - * - * @param scope is the ParserScope instance from which the ParserState - * should be reconstructed. - * @param logger is the logger instance to which error messages should be - * written. - * @return true if the operation was sucessful, false otherwise. - */ - bool deduceState(); - - /** - * Returns the state the ParserStack instance currently is in. - * - * @return the state of the currently active Handler instance or STATE_NONE - * if no handler is on the stack. - */ - const ParserState ¤tState(); - - /** - * Returns the command name that is currently being handled. - * - * @return the name of the command currently being handled by the active - * Handler instance or an empty string if no handler is currently active. - */ - std::string currentCommandName(); - - /** - * Function that should be called whenever a new command starts. - * - * @param name is the name of the command. - * @param args is a map from strings to variants (argument name and value). - * Note that the passed map will be modified. - * @param location is the location in the source file at which the command - * starts. - */ - void start(const std::string &name, Variant::mapType &args, - const SourceLocation &location = SourceLocation{}); - - /** - * Function that should be called whenever a new command starts. - * - * @param name is the name of the command. - * @param args is a map from strings to variants (argument name and value). - * @param location is the location in the source file at which the command - * starts. - */ - void start(std::string name, - const Variant::mapType &args = Variant::mapType{}, - const SourceLocation &location = SourceLocation{}); - - /** - * Function called whenever a command ends. - */ - void end(); - - /** - * Function that should be called whenever data is available for the - * command. - * - * @param data is the data that should be passed to the handler. - * @param field is the field number (the interpretation of this value - * depends on the format that is being parsed). - */ - void data(const std::string &data, int field = 0); - - /** - * Returns a reference to the parser context the parser stack is currently - * working on. - * - * @return a reference to the parser context. - */ - ParserContext &getContext() { return ctx; } -}; -} - -#endif /* _OUSIA_PARSER_STACK_HPP_ */ - diff --git a/src/core/parser/generic/GenericParser.cpp b/src/core/parser/generic/GenericParser.cpp deleted file mode 100644 index e69de29..0000000 --- a/src/core/parser/generic/GenericParser.cpp +++ /dev/null diff --git a/src/core/parser/generic/GenericParser.hpp b/src/core/parser/generic/GenericParser.hpp deleted file mode 100644 index 4f29f94..0000000 --- a/src/core/parser/generic/GenericParser.hpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -/** - * @file GenericParser.hpp - * - * The GenericParser class builds an abstraction layer that separates the - * underlying document format (e.g. osdm or osdmx) from the actual process of - * building the document model. It provides a set of genric functions that - * should be called by the inheriting concrete parser class, e.g. indicating a - * command with parameters, the start/end of a field or the start/end of an - * annotation. The GenericParser maintains an internal stack of - * ParserStateHandlers and relays the commands to the elements of this stack. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_GENERIC_PARSER_HPP_ -#define _OUSIA_GENERIC_PARSER_HPP_ - -#include <core/parser/Parseer.hpp> - -namespace ousia { - -class GenericParser : public Parser { - - - -}; - -} - -#endif _OUSIA_GENERIC_PARSER_HPP_ - diff --git a/src/core/parser/stack/Callbacks.cpp b/src/core/parser/stack/Callbacks.cpp new file mode 100644 index 0000000..6ebc549 --- /dev/null +++ b/src/core/parser/stack/Callbacks.cpp @@ -0,0 +1,23 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "Callbacks.hpp" + +namespace ousia { +} + diff --git a/src/core/parser/stack/Callbacks.hpp b/src/core/parser/stack/Callbacks.hpp new file mode 100644 index 0000000..9c61000 --- /dev/null +++ b/src/core/parser/stack/Callbacks.hpp @@ -0,0 +1,99 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file Callbacks.hpp + * + * Contains an interface defining the callbacks that can be directed from a + * StateHandler to the StateStack, and from the StateStack to + * the actual parser. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_PARSER_STACK_CALLBACKS_HPP_ +#define _OUSIA_PARSER_STACK_CALLBACKS_HPP_ + +#include <string> + +#include <core/common/Whitespace.hpp> + +namespace ousia { +namespace parser_stack { + +/** + * Interface defining a set of callback functions that act as a basis for the + * StateStackCallbacks and the ParserCallbacks. + */ +class Callbacks { +public: + /** + * Virtual descructor. + */ + virtual ~Callbacks() {}; + + /** + * Sets the whitespace mode that specifies how string data should be + * processed. + * + * @param whitespaceMode specifies one of the three WhitespaceMode constants + * PRESERVE, TRIM or COLLAPSE. + */ + virtual void setWhitespaceMode(WhitespaceMode whitespaceMode) = 0; + + /** + * Registers the given token as token that should be reported to the handler + * using the "token" function. + * + * @param token is the token string that should be reported. + */ + virtual void registerToken(const std::string &token) = 0; + + /** + * Unregisters the given token, it will no longer be reported to the handler + * using the "token" function. + * + * @param token is the token string that should be unregistered. + */ + virtual void unregisterToken(const std::string &token) = 0; +}; + +/** + * Interface defining the callback functions that can be passed from a + * StateStack to the underlying parser. + */ +class ParserCallbacks : public Callbacks { + /** + * Checks whether the given token is supported by the parser. The parser + * returns true, if the token is supported, false if this token cannot be + * registered. Note that parsers that do not support the registration of + * tokens at all should always return "true". + * + * @param token is the token that should be checked for support. + * @return true if the token is generally supported (or the parser does not + * support registering tokens at all), false if the token is not supported, + * because e.g. it is a reserved token or it interferes with other tokens. + */ + virtual bool supportsToken(const std::string &token) = 0; +}; + +} +} + +#endif /* _OUSIA_PARSER_STACK_CALLBACKS_HPP_ */ + diff --git a/src/core/parser/stack/DocumentHandler.hpp b/src/core/parser/stack/DocumentHandler.hpp index 697f9e7..475fe69 100644 --- a/src/core/parser/stack/DocumentHandler.hpp +++ b/src/core/parser/stack/DocumentHandler.hpp @@ -26,7 +26,8 @@ #define _OUSIA_DOCUMENT_HANDLER_HPP_ #include <core/common/Variant.hpp> -#include <core/parser/ParserStack.hpp> + +#include "Handler.hpp" namespace ousia { @@ -35,11 +36,11 @@ class Rtti; class DocumentEntity; class FieldDescriptor; -class DocumentHandler : public Handler { +class DocumentHandler : public StaticHandler { public: - using Handler::Handler; + using StaticHandler::StaticHandler; - void start(Variant::mapType &args) override; + bool start(Variant::mapType &args) override; void end() override; @@ -54,7 +55,7 @@ public: using Node::Node; }; -class DocumentChildHandler : public Handler { +class DocumentChildHandler : public StaticHandler { private: void preamble(Handle<Node> parentNode, std::string &fieldName, DocumentEntity *&parent, bool &inField); @@ -68,11 +69,11 @@ private: public: using Handler::Handler; - void start(Variant::mapType &args) override; + bool start(Variant::mapType &args) override; void end() override; - void data(const std::string &data, int fieldIdx) override; + bool data(const Variant &data) override; static Handler *create(const HandlerData &handlerData) { diff --git a/src/core/parser/stack/DomainHandler.hpp b/src/core/parser/stack/DomainHandler.hpp index 7398812..5e8ea60 100644 --- a/src/core/parser/stack/DomainHandler.hpp +++ b/src/core/parser/stack/DomainHandler.hpp @@ -26,18 +26,19 @@ #define _OUSIA_DOMAIN_HANDLER_HPP_ #include <core/common/Variant.hpp> -#include <core/parser/ParserStack.hpp> + +#include "Handler.hpp" namespace ousia { // Forward declarations class Rtti; -class DomainHandler : public Handler { +class DomainHandler : public StaticHandler { public: - using Handler::Handler; + using StaticHandler::StaticHandler; - void start(Variant::mapType &args) override; + bool start(Variant::mapType &args) override; void end() override; @@ -47,11 +48,11 @@ public: } }; -class DomainStructHandler : public Handler { +class DomainStructHandler : public StaticHandler { public: - using Handler::Handler; + using StaticHandler::StaticHandler; - void start(Variant::mapType &args) override; + bool start(Variant::mapType &args) override; void end() override; @@ -61,11 +62,11 @@ public: } }; -class DomainAnnotationHandler : public Handler { +class DomainAnnotationHandler : public StaticHandler { public: - using Handler::Handler; + using StaticHandler::StaticHandler; - void start(Variant::mapType &args) override; + bool start(Variant::mapType &args) override; void end() override; @@ -75,11 +76,11 @@ public: } }; -class DomainAttributesHandler : public Handler { +class DomainAttributesHandler : public StaticHandler { public: - using Handler::Handler; + using StaticHandler::StaticHandler; - void start(Variant::mapType &args) override; + bool start(Variant::mapType &args) override; void end() override; @@ -89,11 +90,11 @@ public: } }; -class DomainFieldHandler : public Handler { +class DomainFieldHandler : public StaticHandler { public: - using Handler::Handler; + using StaticHandler::StaticHandler; - void start(Variant::mapType &args) override; + bool start(Variant::mapType &args) override; void end() override; @@ -103,11 +104,11 @@ public: } }; -class DomainFieldRefHandler : public Handler { +class DomainFieldRefHandler : public StaticHandler { public: - using Handler::Handler; + using StaticHandler::StaticHandler; - void start(Variant::mapType &args) override; + bool start(Variant::mapType &args) override; void end() override; @@ -117,11 +118,11 @@ public: } }; -class DomainPrimitiveHandler : public Handler { +class DomainPrimitiveHandler : public StaticHandler { public: - using Handler::Handler; + using StaticHandler::StaticHandler; - void start(Variant::mapType &args) override; + bool start(Variant::mapType &args) override; void end() override; @@ -131,11 +132,11 @@ public: } }; -class DomainChildHandler : public Handler { +class DomainChildHandler : public StaticHandler { public: - using Handler::Handler; + using StaticHandler::StaticHandler; - void start(Variant::mapType &args) override; + bool start(Variant::mapType &args) override; void end() override; @@ -154,11 +155,11 @@ namespace RttiTypes { extern const Rtti DomainParent; } -class DomainParentHandler : public Handler { +class DomainParentHandler : public StaticHandler { public: - using Handler::Handler; + using StaticHandler::StaticHandler; - void start(Variant::mapType &args) override; + bool start(Variant::mapType &args) override; void end() override; @@ -168,11 +169,11 @@ public: } }; -class DomainParentFieldHandler : public Handler { +class DomainParentFieldHandler : public StaticHandler { public: - using Handler::Handler; + using StaticHandler::StaticHandler; - void start(Variant::mapType &args) override; + bool start(Variant::mapType &args) override; void end() override; @@ -182,11 +183,11 @@ public: } }; -class DomainParentFieldRefHandler : public Handler { +class DomainParentFieldRefHandler : public StaticHandler { public: - using Handler::Handler; + using StaticHandler::StaticHandler; - void start(Variant::mapType &args) override; + bool start(Variant::mapType &args) override; void end() override; diff --git a/src/core/parser/stack/Handler.cpp b/src/core/parser/stack/Handler.cpp new file mode 100644 index 0000000..a608f7f --- /dev/null +++ b/src/core/parser/stack/Handler.cpp @@ -0,0 +1,252 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <core/common/Exceptions.hpp> +#include <core/common/Logger.hpp> +#include <core/parser/ParserContext.hpp> + +#include "Callbacks.hpp" +#include "Handler.hpp" +#include "State.hpp" + +namespace ousia { +namespace parser_stack { + +/* Class HandlerData */ + +HandlerData::HandlerData(ParserContext &ctx, /*Callbacks &callbacks,*/ + const std::string &name, const State &state, + const SourceLocation &location) + : ctx(ctx), + /*callbacks(callbacks),*/ + name(name), + state(state), + location(location) +{ +} + +/* Class Handler */ + +Handler::Handler(const HandlerData &handlerData) + : handlerData(handlerData), internalLogger(nullptr) +{ +} + +Handler::~Handler() {} + +ParserContext &Handler::context() { return handlerData.ctx; } + +ParserScope &Handler::scope() { return handlerData.ctx.getScope(); } + +Manager &Handler::manager() { return handlerData.ctx.getManager(); } + +Logger &Handler::logger() +{ + if (internalLogger != nullptr) { + return *internalLogger; + } + return handlerData.ctx.getLogger(); +} + +const SourceLocation &Handler::location() const { return handlerData.location; } + +void Handler::setWhitespaceMode(WhitespaceMode whitespaceMode) +{ + /*handlerData.callbacks.setWhitespaceMode(whitespaceMode);*/ +} + +void Handler::registerToken(const std::string &token) +{ + /*handlerData.callbacks.registerToken(token);*/ +} + +void Handler::unregisterToken(const std::string &token) +{ + /*handlerData.callbacks.unregisterToken(token);*/ +} + +const std::string &Handler::getName() const { return handlerData.name; } + +const State &Handler::getState() const { return handlerData.state; } + +void Handler::setLogger(Logger &logger) { internalLogger = &logger; } + +void Handler::resetLogger() { internalLogger = nullptr; } + +const SourceLocation &Handler::getLocation() const { return location(); } + +/* Class EmptyHandler */ + +bool EmptyHandler::start(const Variant::mapType &args) +{ + // Just accept anything + return true; +} + +void EmptyHandler::end() +{ + // Do nothing if a command ends +} + +bool EmptyHandler::fieldStart(bool &isDefaultField, size_t fieldIndex) +{ + // Accept any field + return true; +} + +void EmptyHandler::fieldEnd() +{ + // Do not handle fields +} + +bool EmptyHandler::annotationStart(const Variant &className, + const Variant::mapType &args) +{ + // Accept any data + return true; +} + +bool EmptyHandler::annotationEnd(const Variant &className, + const Variant &elementName) +{ + // Accept any annotation + return true; +} + +bool EmptyHandler::data(const Variant &data) +{ + // Support any data + return true; +} + +Handler *EmptyHandler::create(const HandlerData &handlerData) +{ + return new EmptyHandler(handlerData); +} + +/* Class StaticHandler */ + +bool StaticHandler::start(const Variant::mapType &args) +{ + // Do nothing in the default implementation, accept anything + return true; +} + +void StaticHandler::end() +{ + // Do nothing here +} + +bool StaticHandler::fieldStart(bool &isDefault, size_t fieldIdx) +{ + // Return true if either the default field is requested or the field index + // is zero. This simulates that there is exactly one field (a default field) + if (fieldIdx == 0) { + isDefault = true; + return true; + } + return false; +} + +void StaticHandler::fieldEnd() +{ + // Do nothing here +} + +bool StaticHandler::annotationStart(const Variant &className, + const Variant::mapType &args) +{ + // No annotations supported + return false; +} + +bool StaticHandler::annotationEnd(const Variant &className, + const Variant &elementName) +{ + // No annotations supported + return false; +} + +bool StaticHandler::data(const Variant &data) +{ + logger().error("Did not expect any data here", data); + return false; +} + +/* Class StaticFieldHandler */ + +StaticFieldHandler::StaticFieldHandler(const HandlerData &handlerData, + const std::string &argName) + : StaticHandler(handlerData), argName(argName), handled(false) +{ +} + +bool StaticFieldHandler::start(const Variant::mapType &args) +{ + if (!argName.empty()) { + auto it = args.find(argName); + if (it != args.end()) { + handled = true; + doHandle(it->second, args); + return true; + } + } + + this->args = args; + return true; +} + +void StaticFieldHandler::end() +{ + if (!handled) { + if (!argName.empty()) { + logger().error(std::string("Required argument \"") + argName + + std::string("\" is missing."), + location()); + } else { + logger().error("Command requires data, but no data given", + location()); + } + } +} + +bool StaticFieldHandler::data(const Variant &data) +{ + // Call the doHandle function if this has not been done before + if (!handled) { + handled = true; + doHandle(data, args); + return true; + } + + // The doHandle function was already called, print an error message + logger().error( + std::string("Found data, but the corresponding argument \"") + argName + + std::string("\" was already specified"), + data); + + // Print the location at which the attribute was originally specified + auto it = args.find(argName); + if (it != args.end()) { + logger().note(std::string("Attribute was specified here:"), it->second); + } + return false; +} +} +} + diff --git a/src/core/parser/stack/Handler.hpp b/src/core/parser/stack/Handler.hpp new file mode 100644 index 0000000..eeaf555 --- /dev/null +++ b/src/core/parser/stack/Handler.hpp @@ -0,0 +1,414 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#ifndef _OUSIA_PARSER_STACK_HANDLER_HPP_ +#define _OUSIA_PARSER_STACK_HANDLER_HPP_ + +#include <string> + +#include <core/common/Location.hpp> +#include <core/common/Variant.hpp> +#include <core/common/Whitespace.hpp> + +namespace ousia { + +// Forward declarations +class ParserScope; +class ParserContext; +class Logger; + +namespace parser_stack { + +// More forward declarations +class Callbacks; +class State; + +/** + * Class collecting all the data that is being passed to a Handler + * instance. + */ +class HandlerData { +public: + /** + * Reference to the ParserContext instance that should be used to resolve + * references to nodes in the Graph. + */ + ParserContext &ctx; + + /** + * Reference at an instance of the Callbacks class, used for + * modifying the behaviour of the parser (like registering tokens, setting + * the data type or changing the whitespace handling mode). + */ + // Callbacks &callbacks; + + /** + * Contains the name of the command that is being handled. + */ + std::string name; + + /** + * Contains the current state of the state machine. + */ + const State &state; + + /** + * Current source code location. + */ + SourceLocation location; + + /** + * Constructor of the HandlerData class. + * + * @param ctx is the parser context the handler should be executed in. + * @param callbacks is an instance of Callbacks used to notify + * the parser about certain state changes. + * @param name is the name of the string. + * @param state is the state this handler was called for. + * @param location is the location at which the handler is created. + */ + HandlerData(ParserContext &ctx, + /*Callbacks &callbacks,*/ const std::string &name, + const State &state, const SourceLocation &location); +}; + +/** + * The Handler class provides a context for handling a generic stack element. + * It has to beoverridden and registered in the StateStack class to form + * handlers for concrete XML tags. + */ +class Handler { +private: + /** + * Structure containing the internal handler data. + */ + const HandlerData handlerData; + + /** + * Reference at the current logger. If not nullptr, this will override the + * logger from the ParserContext specified in the handlerData. + */ + Logger *internalLogger; + +protected: + /** + * Constructor of the Handler class. + * + * @param data is a structure containing all data being passed to the + * handler. + */ + Handler(const HandlerData &handlerData); + + /** + * Returns a reference at the ParserContext. + * + * @return a reference at the ParserContext. + */ + ParserContext &context(); + + /** + * Returns a reference at the ParserScope instance. + * + * @return a reference at the ParserScope instance. + */ + ParserScope &scope(); + + /** + * Returns a reference at the Manager instance which manages all nodes. + * + * @return a referance at the Manager instance. + */ + Manager &manager(); + + /** + * Returns a reference at the Logger instance used for logging error + * messages. + * + * @return a reference at the Logger instance. + */ + Logger &logger(); + + /** + * Returns the location of the element in the source file, for which this + * Handler was created. + * + * @return the location of the Handler in the source file. + */ + const SourceLocation &location() const; + +public: + /** + * Virtual destructor. + */ + virtual ~Handler(); + + /** + * Calls the corresponding function in the Callbacks instance. Sets the + * whitespace mode that specifies how string data should be processed. The + * calls to this function are placed on a stack by the underlying Stack + * class. + * + * @param whitespaceMode specifies one of the three WhitespaceMode constants + * PRESERVE, TRIM or COLLAPSE. + */ + void setWhitespaceMode(WhitespaceMode whitespaceMode); + + /** + * Calls the corresponding function in the Callbacks instance. + * Registers the given token as token that should be reported to the handler + * using the "token" function. + * + * @param token is the token string that should be reported. + */ + void registerToken(const std::string &token); + + /** + * Calls the corresponding function in the Callbacks instance. + * Unregisters the given token, it will no longer be reported to the handler + * using the "token" function. + * + * @param token is the token string that should be unregistered. + */ + void unregisterToken(const std::string &token); + + /** + * Returns the command name for which the handler was created. + * + * @return a const reference at the command name. + */ + const std::string &getName() const; + + /** + * Reference at the State descriptor for which this Handler was created. + * + * @return a const reference at the constructing State descriptor. + */ + const State &getState() const; + + /** + * Sets the internal logger to the given logger instance. + * + * @param logger is the Logger instance to which the logger should be set. + */ + void setLogger(Logger &logger); + + /** + * Resets the logger instance to the logger instance provided in the + * ParserContext. + */ + void resetLogger(); + + /** + * Returns the location of the element in the source file, for which this + * Handler was created. + * + * @return the location of the Handler in the source file. + */ + const SourceLocation &getLocation() const; + + /** + * Called when the command that was specified in the constructor is + * instanciated. + * + * @param args is a map from strings to variants (argument name and value). + * @return true if the handler was successful in starting the element it + * represents, false otherwise. + */ + virtual bool start(const Variant::mapType &args) = 0; + + /** + * Called before the command for which this handler is defined ends (is + * forever removed from the stack). + */ + virtual void end() = 0; + + /** + * Called when a new field starts, while the handler is active. This + * function should return true if the field is supported, false otherwise. + * No error should be logged if the field cannot be started, the caller will + * take care of that (since it is always valid to start a default field, + * even though the corresponding structure does not have a field, as long as + * no data is fed into the field). + * + * @param isDefault is set to true if the field that is being started is the + * default/tree field. The handler should set the value of this variable to + * true if the referenced field is indeed the default field. + * @param fieldIdx is the numerical index of the field. + */ + virtual bool fieldStart(bool &isDefault, size_t fieldIdx) = 0; + + /** + * Called when a previously opened field ends, while the handler is active. + * Note that a "fieldStart" and "fieldEnd" are always called alternately. + */ + virtual void fieldEnd() = 0; + + /** + * Called whenever an annotation starts while this handler is active. The + * function should return true if starting the annotation was successful, + * false otherwise. + * + * @param className is a string variant containing the name of the + * annotation class and the location of the name in the source code. + * @param args is a map from strings to variants (argument name and value). + * @return true if the mentioned annotation could be started here, false + * if an error occurred. + */ + virtual bool annotationStart(const Variant &className, + const Variant::mapType &args) = 0; + + /** + * Called whenever an annotation ends while this handler is active. The + * function should return true if ending the annotation was successful, + * false otherwise. + * + * @param className is a string variant containing the name of the + * annotation class and the location of the class name in the source code. + * @param elementName is a string variant containing the name of the + * annotation class and the location of the element name in the source code. + * @return true if the mentioned annotation could be started here, false if + * an error occurred. + */ + virtual bool annotationEnd(const Variant &className, + const Variant &elementName) = 0; + + /** + * Called whenever raw data (int the form of a string) is available for the + * Handler instance. Should return true if the data could be handled, false + * otherwise. + * + * @param data is a string variant containing the character data and its + * location. + * @return true if the data could be handled, false otherwise. + */ + virtual bool data(const Variant &data) = 0; +}; + +/** + * HandlerConstructor is a function pointer type used to create concrete + * instances of the Handler class. + * + * @param handlerData is the data that should be passed to the new handler + * instance. + * @return a newly created handler instance. + */ +using HandlerConstructor = Handler *(*)(const HandlerData &handlerData); + +/** + * The EmptyHandler class is used in case no element handler is specified in + * the State descriptor. It just accepts all data and does nothing. + */ +class EmptyHandler : public Handler { +protected: + using Handler::Handler; + +public: + bool start(const Variant::mapType &args) override; + void end() override; + bool fieldStart(bool &isDefault, size_t fieldIdx) override; + void fieldEnd() override; + bool annotationStart(const Variant &className, + const Variant::mapType &args) override; + bool annotationEnd(const Variant &className, + const Variant &elementName) override; + bool data(const Variant &data) override; + + /** + * Creates an instance of the EmptyHandler class. + */ + static Handler *create(const HandlerData &handlerData); +}; + +/** + * The StaticHandler class is used to handle predifined commands which do + * neither support annotations, nor multiple fields. Child classes can decide + * whether a single data field should be used. + */ +class StaticHandler : public Handler { +protected: + using Handler::Handler; + +public: + bool start(const Variant::mapType &args) override; + void end() override; + bool fieldStart(bool &isDefault, size_t fieldIdx) override; + void fieldEnd() override; + bool annotationStart(const Variant &className, + const Variant::mapType &args) override; + bool annotationEnd(const Variant &className, + const Variant &elementName) override; + bool data(const Variant &data) override; +}; + +/** + * The StaticFieldHandler class is used to handle predifined commands which do + * neither support annotations, nor multiple fields. Additionally, it captures a + * data entry from a single default field. + */ +class StaticFieldHandler : public StaticHandler { +private: + /** + * Set to the name of the data argument that should be used instead of the + * data field, if no data field is given. + */ + std::string argName; + + /** + * Set to true, once the "doHandle" function has been called. + */ + bool handled; + + /** + * Map containing the arguments given in the start function. + */ + Variant::mapType args; + +protected: + /** + * Constructor of the StaticFieldHandler class. + * + * @param handlerData is a structure containing the internal data that + * should be stored inside the handler. + * @param name of the data argument that -- if present -- should be used + * instead of the data field. If empty, data is not captured from the + * arguments. If both, data in the data field and the argument, are given, + * this results in an error. + */ + StaticFieldHandler(const HandlerData &handlerData, + const std::string &argName); + + /** + * Function that should be overriden in order to handle the field data and + * the other arguments. This function is not called if no data was given. + * + * @param fieldData is the captured field data. + * @param args are the arguments that were given in the "start" function. + */ + virtual void doHandle(const Variant &fieldData, + const Variant::mapType &args) = 0; + +public: + bool start(const Variant::mapType &args) override; + void end() override; + bool data(const Variant &data) override; +}; +} +} + +#endif /* _OUSIA_PARSER_STACK_HANDLER_HPP_ */ + diff --git a/src/core/parser/stack/ImportIncludeHandler.hpp b/src/core/parser/stack/ImportIncludeHandler.hpp index b0767be..f9abe55 100644 --- a/src/core/parser/stack/ImportIncludeHandler.hpp +++ b/src/core/parser/stack/ImportIncludeHandler.hpp @@ -19,6 +19,9 @@ /** * @file ImportIncludeHandler.hpp * + * Contains the conceptually similar handlers for the "include" and "import" + * commands. + * * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) */ @@ -30,43 +33,54 @@ namespace ousia { -class ImportIncludeHandler : public Handler { -protected: - bool srcInArgs = false; - std::string rel; - std::string type; - std::string src; - -public: - using Handler::Handler; - - void start(Variant::mapType &args) override; - - void data(const std::string &data, int field) override; -}; - -class ImportHandler : public ImportIncludeHandler { +/** + * The ImportHandler is responsible for handling the "import" command. An import + * creates a reference to a specified file. The specified file is parsed (if + * this has not already been done) outside of the context of the current file. + * If the specified resource has already been parsed, a reference to the already + * parsed file is inserted. Imports are only possible before no other content + * has been parsed. + */ +class ImportHandler : public StaticFieldHandler { public: - using ImportIncludeHandler::ImportIncludeHandler; - - void start(Variant::mapType &args) override; - - void end() override; - + using StaticFieldHandler::StaticFieldHandler; + + void doHandle(const Variant &fieldData, + const Variant::mapType &args) override; + + /** + * Creates a new instance of the ImportHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. + */ static Handler *create(const HandlerData &handlerData) { return new ImportHandler{handlerData}; } }; -class IncludeHandler : public ImportIncludeHandler { +/** + * The IncludeHandler is responsible for handling the "include" command. The + * included file is parsed in the context of the current file and will change + * the content that is currently being parsed. Includes are possible at (almost) + * any position in the source file. + */ +class IncludeHandler : public StaticFieldHandler { public: - using ImportIncludeHandler::ImportIncludeHandler; - - void start(Variant::mapType &args) override; - - void end() override; - + using StaticFieldHandler::StaticFieldHandler; + + void doHandle(const Variant &fieldData, + const Variant::mapType &args) override; + + /** + * Creates a new instance of the IncludeHandler. + * + * @param handlerData is the data that is passed to the constructor of the + * Handler base class and used there to e.g. access the ParserContext and + * the Callbacks instance. + */ static Handler *create(const HandlerData &handlerData) { return new IncludeHandler{handlerData}; diff --git a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp new file mode 100644 index 0000000..d84a19c --- /dev/null +++ b/src/core/parser/stack/Stack.cpp @@ -0,0 +1,554 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <sstream> + +#include <core/common/Logger.hpp> +#include <core/common/Utils.hpp> +#include <core/common/Exceptions.hpp> +#include <core/parser/ParserScope.hpp> +#include <core/parser/ParserContext.hpp> + +#include "Handler.hpp" +#include "Stack.hpp" +#include "State.hpp" + +namespace ousia { +namespace parser_stack { + +/* Class HandlerInfo */ + +HandlerInfo::HandlerInfo() : HandlerInfo(nullptr) {} + +HandlerInfo::HandlerInfo(std::shared_ptr<Handler> handler) + : handler(handler), + fieldIdx(0), + valid(true), + implicit(false), + inField(false), + inDefaultField(false), + inImplicitDefaultField(false), + inValidField(false), + hadDefaultField(false) +{ +} + +HandlerInfo::HandlerInfo(bool valid, bool implicit, bool inField, + bool inDefaultField, bool inImplicitDefaultField, + bool inValidField) + : handler(nullptr), + fieldIdx(0), + valid(valid), + implicit(implicit), + inField(inField), + inDefaultField(inDefaultField), + inImplicitDefaultField(inImplicitDefaultField), + inValidField(inValidField), + hadDefaultField(false) +{ +} + +HandlerInfo::~HandlerInfo() +{ + // Do nothing +} + +void HandlerInfo::fieldStart(bool isDefault, bool isImplicit, bool isValid) +{ + inField = true; + inDefaultField = isDefault || isImplicit; + inImplicitDefaultField = isImplicit; + inValidField = isValid; + hadDefaultField = hadDefaultField || inDefaultField; + fieldIdx++; +} + +void HandlerInfo::fieldEnd() +{ + inField = false; + inDefaultField = false; + inImplicitDefaultField = false; + inValidField = false; +} + +/** + * Stub instance of HandlerInfo containing no handler information. + */ +static HandlerInfo EmptyHandlerInfo{true, true, true, true, false, true}; + +/* Helper functions */ + +/** + * Returns an Exception that should be thrown when a currently invalid command + * is thrown. + * + * @param name is the name of the command for which no state transition is + * found. + * @param expected is a set containing the names of the expected commands. + */ +static LoggableException buildInvalidCommandException( + const std::string &name, const std::set<std::string> &expected) +{ + if (expected.empty()) { + return LoggableException{ + std::string{"No nested elements allowed, but got \""} + name + + std::string{"\""}}; + } else { + return LoggableException{ + std::string{"Expected "} + + (expected.size() == 1 ? std::string{"\""} + : std::string{"one of \""}) + + Utils::join(expected, "\", \"") + std::string{"\", but got \""} + + name + std::string{"\""}}; + } +} + +/* Class Stack */ + +Stack::Stack(ParserContext &ctx, + const std::multimap<std::string, const State *> &states) + : ctx(ctx), states(states) +{ + // If the scope instance is not empty we need to deduce the current parser + // state + if (!ctx.getScope().isEmpty()) { + deduceState(); + } +} + +Stack::~Stack() +{ + while (!stack.empty()) { + // Fetch the topmost stack element + HandlerInfo &info = currentInfo(); + + // It is an error if we're still in a field of an element while the + // Stack instance is destroyed. Log that + if (handlersValid()) { + if (info.inField && !info.implicit && + !info.inImplicitDefaultField) { + logger().error( + std::string("Reached end of stream, but command \"") + + info.handler->getName() + + "\" has not ended yet. Command was started here:", + info.handler->getLocation()); + } + } + + // Remove the command from the stack + endCurrentHandler(); + } +} + +void Stack::deduceState() +{ + // Assemble all states + std::vector<const State *> states; + for (const auto &e : this->states) { + states.push_back(e.second); + } + + // Fetch the type signature of the scope and derive all possible states, + // abort if no unique parser state was found + std::vector<const State *> possibleStates = + StateDeductor(ctx.getScope().getStackTypeSignature(), states).deduce(); + if (possibleStates.size() != 1U) { + throw LoggableException( + "Error while including file: Cannot deduce parser state."); + } + + // Switch to this state by creating a handler, but do not call its start + // function + const State &state = *possibleStates[0]; + HandlerConstructor ctor = + state.elementHandler ? state.elementHandler : EmptyHandler::create; + + std::shared_ptr<Handler> handler = + std::shared_ptr<Handler>{ctor({ctx, "", state, SourceLocation{}})}; + stack.emplace_back(handler); + + // Set the correct flags for this implicit handler + HandlerInfo &info = currentInfo(); + info.implicit = true; + info.fieldStart(true, false, true); +} + +std::set<std::string> Stack::expectedCommands() +{ + const State *currentState = &(this->currentState()); + std::set<std::string> res; + for (const auto &v : states) { + if (v.second->parents.count(currentState)) { + res.insert(v.first); + } + } + return res; +} + +const State &Stack::currentState() +{ + return stack.empty() ? States::None : stack.back().handler->getState(); +} + +std::string Stack::currentCommandName() +{ + return stack.empty() ? std::string{} : stack.back().handler->getName(); +} + +const State *Stack::findTargetState(const std::string &name) +{ + const State *currentState = &(this->currentState()); + auto range = states.equal_range(name); + for (auto it = range.first; it != range.second; it++) { + const StateSet &parents = it->second->parents; + if (parents.count(currentState) || parents.count(&States::All)) { + return it->second; + } + } + + return nullptr; +} + +const State *Stack::findTargetStateOrWildcard(const std::string &name) +{ + // Try to find the target state with the given name, if none is found, try + // find a matching "*" state. + State const *targetState = findTargetState(name); + if (targetState == nullptr) { + return findTargetState("*"); + } + return targetState; +} + +HandlerInfo &Stack::currentInfo() +{ + return stack.empty() ? EmptyHandlerInfo : stack.back(); +} +HandlerInfo &Stack::lastInfo() +{ + return stack.size() < 2U ? EmptyHandlerInfo : stack[stack.size() - 2]; +} + +void Stack::endCurrentHandler() +{ + if (!stack.empty()) { + // Fetch the handler info for the current top-level element + HandlerInfo &info = stack.back(); + + // Do not call any callback functions while the stack is marked as + // invalid or this is an elment marked as "implicit" + if (!info.implicit && handlersValid()) { + // Make sure the fieldEnd handler is called if the element still + // is in a field + if (info.inField) { + info.handler->fieldEnd(); + info.fieldEnd(); + } + + // Call the "end" function of the corresponding Handler instance + info.handler->end(); + } + + // Remove the element from the stack + stack.pop_back(); + } +} + +bool Stack::ensureHandlerIsInField() +{ + // If the current handler is not in a field (and actually has a handler) + // try to start a default field + HandlerInfo &info = currentInfo(); + if (!info.inField && info.handler != nullptr) { + // Abort if the element already had a default field + if (info.hadDefaultField) { + return false; + } + + // Try to start a new default field, abort if this did not work + bool isDefault = true; + if (!info.handler->fieldStart(isDefault, info.fieldIdx)) { + info.handler->fieldEnd(); + endCurrentHandler(); + return false; + } + + // Mark the field as started + info.fieldStart(true, true, true); + } + return true; +} + +bool Stack::handlersValid() +{ + for (auto it = stack.crbegin(); it != stack.crend(); it++) { + if (!it->valid) { + return false; + } + } + return true; +} + +Logger &Stack::logger() { return ctx.getLogger(); } + +void Stack::command(const Variant &name, const Variant::mapType &args) +{ + // Make sure the given identifier is valid (preventing "*" from being + // malicously passed to this function) + if (!Utils::isNamespacedIdentifier(name.asString())) { + throw LoggableException(std::string("Invalid identifier \"") + + name.asString() + std::string("\""), + name); + } + + State const *lastTargetState = nullptr; + Variant::mapType canonicalArgs; + while (true) { + // Try to find a target state for the given command, if none can be + // found and the current command does not have an open field, then try + // to create an empty default field, otherwise this is an exception + const State *targetState = findTargetStateOrWildcard(name.asString()); + if (targetState == nullptr) { + if (!currentInfo().inField) { + endCurrentHandler(); + continue; + } else { + throw buildInvalidCommandException(name.asString(), + expectedCommands()); + } + } + + // Make sure we're currently inside a field + if (!ensureHandlerIsInField()) { + endCurrentHandler(); + continue; + } + + // Fork the logger. We do not want any validation errors to skip + LoggerFork loggerFork = logger().fork(); + + // Canonicalize the arguments (if this has not already been done), allow + // additional arguments + if (lastTargetState != targetState) { + canonicalArgs = args; + targetState->arguments.validateMap(canonicalArgs, loggerFork, true); + lastTargetState = targetState; + } + + // Instantiate the handler and push it onto the stack + HandlerConstructor ctor = targetState->elementHandler + ? targetState->elementHandler + : EmptyHandler::create; + std::shared_ptr<Handler> handler{ + ctor({ctx, name.asString(), *targetState, name.getLocation()})}; + stack.emplace_back(handler); + + // Fetch the HandlerInfo for the parent element and the current element + HandlerInfo &parentInfo = lastInfo(); + HandlerInfo &info = currentInfo(); + + // Call the "start" method of the handler, store the result of the start + // method as the validity of the handler -- do not call the start method + // if the stack is currently invalid (as this may cause further, + // unwanted errors) + bool validStack = handlersValid(); + info.valid = false; + if (validStack) { + handler->setLogger(loggerFork); + try { + info.valid = handler->start(canonicalArgs); + } + catch (LoggableException ex) { + loggerFork.log(ex); + } + handler->resetLogger(); + } + + // We started the command within an implicit default field and it is not + // valid -- remove both the new handler and the parent field from the + // stack + if (!info.valid && parentInfo.inImplicitDefaultField) { + endCurrentHandler(); + endCurrentHandler(); + continue; + } + + // If we ended up here, starting the command may or may not have worked, + // but after all, we cannot unroll the stack any further. Update the + // "valid" flag, commit any potential error messages and return. + info.valid = parentInfo.valid && info.valid; + loggerFork.commit(); + return; + } +} + +void Stack::data(const Variant &data) +{ + while (true) { + // Check whether there is any command the data can be sent to + if (stack.empty()) { + throw LoggableException("No command here to receive data."); + } + + // Fetch the current command handler information + HandlerInfo &info = currentInfo(); + + // Make sure the current handler has an open field + if (!ensureHandlerIsInField()) { + endCurrentHandler(); + continue; + } + + // If this field should not get any data, log an error and do not call + // the "data" handler + if (!info.inValidField) { + logger().error("Did not expect any data here", data); + } + + if (handlersValid() && info.inValidField) { + // Fork the logger and set it as temporary logger for the "start" + // method. We only want to keep error messages if this was not a try + // to implicitly open a default field. + LoggerFork loggerFork = logger().fork(); + info.handler->setLogger(loggerFork); + + // Pass the data to the current Handler instance + bool valid = false; + try { + valid = info.handler->data(data); + } + catch (LoggableException ex) { + loggerFork.log(ex); + } + + // Reset the logger instance as soon as possible + info.handler->resetLogger(); + + // If placing the data here failed and we're currently in an + // implicitly opened field, just unroll the stack to the next field + // and try again + if (!valid && info.inImplicitDefaultField) { + endCurrentHandler(); + continue; + } + + // Commit the content of the logger fork. Do not change the valid + // flag. + loggerFork.commit(); + } + + // There was no reason to unroll the stack any further, so continue + return; + } +} + +void Stack::fieldStart(bool isDefault) +{ + // Make sure the current handler stack is not empty + if (stack.empty()) { + throw LoggableException( + "No command for which a field could be started"); + } + + // Fetch the information attached to the current handler + HandlerInfo &info = currentInfo(); + if (info.inField) { + logger().error( + "Got field start, but there is no command for which to start the " + "field."); + return; + } + + // Copy the isDefault flag to a local variable, the fieldStart method will + // write into this variable + bool defaultField = isDefault; + + // Do not call the "fieldStart" function if we're in an invalid subtree + bool valid = false; + if (handlersValid()) { + try { + valid = info.handler->fieldStart(defaultField, info.fieldIdx); + } + catch (LoggableException ex) { + logger().log(ex); + } + if (!valid && !defaultField) { + logger().error( + std::string("Cannot start a new field here (index ") + + std::to_string(info.fieldIdx + 1) + + std::string("), field does not exist")); + } + } + + // Mark the field as started + info.fieldStart(defaultField, false, valid); +} + +void Stack::fieldEnd() +{ + // Make sure the current handler stack is not empty + if (stack.empty()) { + throw LoggableException("No command for which a field could be ended"); + } + + // Fetch the information attached to the current handler + HandlerInfo &info = currentInfo(); + if (!info.inField) { + logger().error( + "Got field end, but there is no command for which to end the " + "field."); + return; + } + + // Only continue if the current handler stack is in a valid state, do not + // call the fieldEnd function if something went wrong before + if (handlersValid()) { + try { + info.handler->fieldEnd(); + } + catch (LoggableException ex) { + logger().log(ex); + } + } + + // This command no longer is in a field + info.fieldEnd(); + + // As soon as this command had a default field, remove it from the stack + if (info.hadDefaultField) { + endCurrentHandler(); + } +} + +void Stack::annotationStart(const Variant &className, const Variant &args) +{ + // TODO +} + +void Stack::annotationEnd(const Variant &className, const Variant &elementName) +{ + // TODO +} + +void Stack::token(Variant token) +{ + // TODO +} +} +} + diff --git a/src/core/parser/stack/Stack.hpp b/src/core/parser/stack/Stack.hpp new file mode 100644 index 0000000..76eefd9 --- /dev/null +++ b/src/core/parser/stack/Stack.hpp @@ -0,0 +1,341 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file Stack.hpp + * + * Helper classes for document or description parsers. Contains the + * Stack class, which is an pushdown automaton responsible for + * accepting commands in the correct order and calling specified handlers. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_PARSER_STACK_STACK_HPP_ +#define _OUSIA_PARSER_STACK_STACK_HPP_ + +#include <cstdint> + +#include <map> +#include <memory> +#include <set> +#include <vector> + +#include <core/common/Variant.hpp> +#include <core/parser/Parser.hpp> + +namespace ousia { + +// Forward declarations +class ParserContext; +class Logger; + +namespace parser_stack { + +// Forward declarations +class Handler; +class State; + +/** + * The HandlerInfo class is used internally by the stack to associate additional + * (mutable) data with a handler instance. + */ +class HandlerInfo { +public: + /** + * Pointer pointing at the actual handler instance. + */ + std::shared_ptr<Handler> handler; + + /** + * Next field index to be passed to the "fieldStart" function of the Handler + * class. + */ + size_t fieldIdx; + + /** + * Set to true if the handler is valid (which is the case if the "start" + * method has returned true). If the handler is invalid, no more calls are + * directed at it until it can be removed from the stack. + */ + bool valid : 1; + + /** + * Set to true if this is an implicit handler, that was created when the + * current stack state was deduced. + */ + bool implicit : 1; + + /** + * Set to true if the handler currently is in a field. + */ + bool inField : 1; + + /** + * Set to true if the handler currently is in the default field. + */ + bool inDefaultField : 1; + + /** + * Set to true if the handler currently is in an implicitly started default + * field. + */ + bool inImplicitDefaultField : 1; + + /** + * Set to false if this field is only opened pro-forma and does not accept + * any data. Otherwise set to true. + */ + bool inValidField : 1; + + /** + * Set to true, if the default field was already started. + */ + bool hadDefaultField : 1; + + /** + * Default constructor of the HandlerInfo class. + */ + HandlerInfo(); + /** + * Constructor of the HandlerInfo class, allows to set all flags manually. + */ + HandlerInfo(bool valid, bool implicit, bool inField, bool inDefaultField, + bool inImplicitDefaultField, bool inValidField); + + /** + * Constructor of the HandlerInfo class, taking a shared_ptr to the handler + * to which additional information should be attached. + */ + HandlerInfo(std::shared_ptr<Handler> handler); + + /** + * Destructor of the HandlerInfo class (to allow Handler to be forward + * declared). + */ + ~HandlerInfo(); + + /** + * Updates the "field" flags according to a "fieldStart" event. + */ + void fieldStart(bool isDefault, bool isImplicit, bool isValid); + + /** + * Updates the "fields" flags according to a "fieldEnd" event. + */ + void fieldEnd(); +}; + +/** + * The Stack class is a pushdown automaton responsible for turning a command + * stream into a tree of Node instances. It does so by following a state + * transition graph and creating a set of Handler instances, which are placed + * on the stack. + */ +class Stack { +private: + /** + * Reference at the parser context. + */ + ParserContext &ctx; + + /** + * Map containing all registered command names and the corresponding + * state descriptors. + */ + const std::multimap<std::string, const State *> &states; + + /** + * Internal stack used for managing the currently active Handler instances. + */ + std::vector<HandlerInfo> stack; + + /** + * Return the reference in the Logger instance stored within the context. + */ + Logger &logger(); + + /** + * Used internally to get all expected command names for the current state. + * This function is used to build error messages. + * + * @return a set of strings containing the names of the expected commands. + */ + std::set<std::string> expectedCommands(); + + /** + * Returns the targetState for a command with the given name that can be + * reached from the current state. + * + * @param name is the name of the requested command. + * @return nullptr if no target state was found, a pointer at the target + * state otherwise. + */ + const State *findTargetState(const std::string &name); + + /** + * Returns the targetState for a command with the given name that can be + * reached from the current state, also including the wildcard "*" state. + * Throws an exception if the given target state is not a valid identifier. + * + * @param name is the name of the requested command. + * @return nullptr if no target state was found, a pointer at the target + * state otherwise. + */ + const State *findTargetStateOrWildcard(const std::string &name); + + /** + * Tries to reconstruct the parser state from the Scope instance of the + * ParserContext given in the constructor. This functionality is needed for + * including files,as the Parser of the included file needs to be brought to + * an equivalent state as the one in the including file. + */ + void deduceState(); + + /** + * Returns a reference at the current HandlerInfo instance (or a stub + * HandlerInfo instance if the stack is empty). + */ + HandlerInfo ¤tInfo(); + + /** + * Returns a reference at the last HandlerInfo instance (or a stub + * HandlerInfo instance if the stack has only one element). + */ + HandlerInfo &lastInfo(); + + /** + * Ends the current handler and removes the corresponding element from the + * stack. + */ + void endCurrentHandler(); + + /** + * Tries to start a default field for the current handler, if currently the + * handler is not inside a field and did not have a default field yet. + * + * @return true if the handler is inside a field, false if no field could + * be started. + */ + bool ensureHandlerIsInField(); + + /** + * Returns true if all handlers on the stack are currently valid, or false + * if at least one handler is invalid. + * + * @return true if all handlers on the stack are valid. + */ + bool handlersValid(); + +public: + /** + * Creates a new instance of the Stack class. + * + * @param ctx is the parser context the parser stack is working on. + * @param states is a map containing the command names and pointers at the + * corresponding State instances. + */ + Stack(ParserContext &ctx, + const std::multimap<std::string, const State *> &states); + + /** + * Destructor of the Stack class. + */ + ~Stack(); + + /** + * Returns the state the Stack instance currently is in. + * + * @return the state of the currently active Handler instance or STATE_NONE + * if no handler is on the stack. + */ + const State ¤tState(); + + /** + * Returns the command name that is currently being handled. + * + * @return the name of the command currently being handled by the active + * Handler instance or an empty string if no handler is currently active. + */ + std::string currentCommandName(); + + /** + * Function that should be called whenever a new command is reached. + * + * @param name is the name of the command (including the namespace + * separator ':') and its corresponding location. Must be a string variant. + * @param args is a map containing the arguments that were passed to the + * command. + */ + void command(const Variant &name, const Variant::mapType &args); + + /** + * Function that shuold be called whenever character data is found in the + * input stream. May only be called if the currently is a command on the + * stack. + * + * @param data is a string variant containing the data that has been found. + */ + void data(const Variant &data); + + /** + * Function that should be called whenever a new field starts. Fields of the + * same command may not be separated by calls to data or annotations. Doing + * so will result in a LoggableException. + * + * @param isDefault should be set to true if the started field explicitly + * is the default field. + */ + void fieldStart(bool isDefault); + + /** + * Function that should be called whenever a field ends. Calling this + * function if there is no field to end will result in a LoggableException. + */ + void fieldEnd(); + + /** + * Function that should be called whenever an annotation starts. + * + * @param name is the name of the annotation class. + * @param args is a map variant containing the arguments that were passed + * to the annotation. + */ + void annotationStart(const Variant &className, const Variant &args); + + /** + * Function that should be called whenever an annotation ends. + * + * @param name is the name of the annotation class that was ended. + * @param annotationName is the name of the annotation that was ended. + */ + void annotationEnd(const Variant &className, const Variant &elementName); + + /** + * Function that should be called whenever a previously registered token + * is found in the input stream. + * + * @param token is string variant containing the token that was encountered. + */ + void token(Variant token); +}; +} +} + +#endif /* _OUSIA_STACK_HPP_ */ + diff --git a/src/core/parser/ParserState.cpp b/src/core/parser/stack/State.cpp index f635d86..d72f533 100644 --- a/src/core/parser/ParserState.cpp +++ b/src/core/parser/stack/State.cpp @@ -16,88 +16,97 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */ -#include "ParserState.hpp" +#include "State.hpp" namespace ousia { +namespace parser_stack { -/* Class ParserState */ +/* Class State */ -ParserState::ParserState() : elementHandler(nullptr) {} +State::State() : elementHandler(nullptr) {} -ParserState::ParserState(ParserStateSet parents, Arguments arguments, +State::State(StateSet parents, Arguments arguments, RttiSet createdNodeTypes, - HandlerConstructor elementHandler) + HandlerConstructor elementHandler, + bool supportsAnnotations) : parents(parents), arguments(arguments), createdNodeTypes(createdNodeTypes), - elementHandler(elementHandler) + elementHandler(elementHandler), + supportsAnnotations(supportsAnnotations) { } -ParserState::ParserState(const ParserStateBuilder &builder) - : ParserState(builder.build()) +State::State(const StateBuilder &builder) + : State(builder.build()) { } -/* Class ParserStateBuilder */ +/* Class StateBuilder */ -ParserStateBuilder &ParserStateBuilder::copy(const ParserState &state) +StateBuilder &StateBuilder::copy(const State &state) { this->state = state; return *this; } -ParserStateBuilder &ParserStateBuilder::parent(const ParserState *parent) +StateBuilder &StateBuilder::parent(const State *parent) { - state.parents = ParserStateSet{parent}; + state.parents = StateSet{parent}; return *this; } -ParserStateBuilder &ParserStateBuilder::parents(const ParserStateSet &parents) +StateBuilder &StateBuilder::parents(const StateSet &parents) { state.parents = parents; return *this; } -ParserStateBuilder &ParserStateBuilder::arguments(const Arguments &arguments) +StateBuilder &StateBuilder::arguments(const Arguments &arguments) { state.arguments = arguments; return *this; } -ParserStateBuilder &ParserStateBuilder::createdNodeType(const Rtti *type) +StateBuilder &StateBuilder::createdNodeType(const Rtti *type) { state.createdNodeTypes = RttiSet{type}; return *this; } -ParserStateBuilder &ParserStateBuilder::createdNodeTypes(const RttiSet &types) +StateBuilder &StateBuilder::createdNodeTypes(const RttiSet &types) { state.createdNodeTypes = types; return *this; } -ParserStateBuilder &ParserStateBuilder::elementHandler( +StateBuilder &StateBuilder::elementHandler( HandlerConstructor elementHandler) { state.elementHandler = elementHandler; return *this; } -const ParserState &ParserStateBuilder::build() const { return state; } +StateBuilder &StateBuilder::supportsAnnotations(bool supportsAnnotations) +{ + state.supportsAnnotations = supportsAnnotations; + return *this; +} -/* Class ParserStateDeductor */ +const State &StateBuilder::build() const { return state; } -ParserStateDeductor::ParserStateDeductor( +/* Class StateDeductor */ + +StateDeductor::StateDeductor( std::vector<const Rtti *> signature, - std::vector<const ParserState *> states) + std::vector<const State *> states) : tbl(signature.size()), signature(std::move(signature)), states(std::move(states)) { } -bool ParserStateDeductor::isActive(size_t d, const ParserState *s) +bool StateDeductor::isActive(size_t d, const State *s) { // Lookup the "active" state of (d, s), if it was not already set // (e.second is true) we'll have to calculate it @@ -123,7 +132,7 @@ bool ParserStateDeductor::isActive(size_t d, const ParserState *s) // Check whether any of the parent nodes were active -- either for // the previous element (if this one is generative) or for the // current element (assuming this node was not generative) - for (const ParserState *parent : s->parents) { + for (const State *parent : s->parents) { if ((isGenerative && isActive(d - 1, parent)) || isActive(d, parent)) { res = true; @@ -136,9 +145,9 @@ bool ParserStateDeductor::isActive(size_t d, const ParserState *s) return res; } -std::vector<const ParserState *> ParserStateDeductor::deduce() +std::vector<const State *> StateDeductor::deduce() { - std::vector<const ParserState *> res; + std::vector<const State *> res; if (!signature.empty()) { const size_t D = signature.size(); for (auto s : states) { @@ -153,9 +162,10 @@ std::vector<const ParserState *> ParserStateDeductor::deduce() /* Constant initializations */ -namespace ParserStates { -const ParserState All; -const ParserState None; +namespace States { +const State All; +const State None; +} } } diff --git a/src/core/parser/ParserState.hpp b/src/core/parser/stack/State.hpp index 6487fdd..4766235 100644 --- a/src/core/parser/ParserState.hpp +++ b/src/core/parser/stack/State.hpp @@ -17,10 +17,10 @@ */ /** - * @file ParserState.hpp + * @file State.hpp * - * Defines the ParserState class used within the ParserStack pushdown - * automaton and the ParserStateBuilder class for convenient construction of + * Defines the State class used within the ParserStack pushdown + * automaton and the StateBuilder class for convenient construction of * such classes. * * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) @@ -33,12 +33,14 @@ #include <core/common/Rtti.hpp> #include <core/common/Argument.hpp> +#include <core/common/Whitespace.hpp> namespace ousia { +namespace parser_stack { // Forward declarations -class ParserStateBuilder; -class ParserState; +class StateBuilder; +class State; class HandlerData; class Handler; using HandlerConstructor = Handler *(*)(const HandlerData &handlerData); @@ -47,17 +49,17 @@ using HandlerConstructor = Handler *(*)(const HandlerData &handlerData); * Set of pointers of parser states -- used for specifying a set of parent * states. */ -using ParserStateSet = std::unordered_set<const ParserState *>; +using StateSet = std::unordered_set<const State *>; /** - * Class used for the complete specification of a ParserState. Stores possible + * Class used for the complete specification of a State. Stores possible * parent states, state handlers and arguments to be passed to that state. */ -struct ParserState { +struct State { /** * Vector containing all possible parent states. */ - ParserStateSet parents; + StateSet parents; /** * Descriptor of the arguments that should be passed to the handler. @@ -66,8 +68,8 @@ struct ParserState { /** * Set containing the types of the nodes that may be created in this - * ParserState. This information is needed for Parsers to reconstruct the - * current ParserState from a given ParserScope when a file is included. + * State. This information is needed for Parsers to reconstruct the + * current State from a given ParserScope when a file is included. */ RttiSet createdNodeTypes; @@ -79,109 +81,119 @@ struct ParserState { HandlerConstructor elementHandler; /** + * Set to true if this handler does support annotations. This is almost + * always false (e.g. all description handlers), except for document + * element handlers. + */ + bool supportsAnnotations; + + /** * Default constructor, initializes the handlers with nullptr. */ - ParserState(); + State(); /** - * Constructor taking values for all fields. Use the ParserStateBuilder - * class for a more convenient construction of ParserState instances. + * Constructor taking values for all fields. Use the StateBuilder + * class for a more convenient construction of State instances. * * @param parents is a vector containing all possible parent states. * @param arguments is a descriptor of arguments that should be passed to * the handler. * @param createdNodeTypes is a set containing the types of the nodes tha - * may be created in this ParserState. This information is needed for - * Parsers to reconstruct the current ParserState from a given ParserScope + * may be created in this State. This information is needed for + * Parsers to reconstruct the current State from a given ParserScope * when a file is included. * @param elementHandler is a pointer at a function which creates a new * concrete Handler instance for the elements described by this state. May * be nullptr in which case no handler instance is created. + * @param supportsAnnotations specifies whether annotations are supported + * here at all. */ - ParserState(ParserStateSet parents, Arguments arguments = Arguments{}, + State(StateSet parents, Arguments arguments = Arguments{}, RttiSet createdNodeTypes = RttiSet{}, - HandlerConstructor elementHandler = nullptr); + HandlerConstructor elementHandler = nullptr, + bool supportsAnnotations = false); /** - * Creates this ParserState from the given ParserStateBuilder instance. + * Creates this State from the given StateBuilder instance. */ - ParserState(const ParserStateBuilder &builder); + State(const StateBuilder &builder); }; /** - * The ParserStateBuilder class is a class used for conveniently building new - * ParserState instances. + * The StateBuilder class is a class used for conveniently building new + * State instances. */ -class ParserStateBuilder { +class StateBuilder { private: /** - * ParserState instance that is currently being built by the - * ParserStateBuilder. + * State instance that is currently being built by the + * StateBuilder. */ - ParserState state; + State state; public: /** - * Copies the ParserState instance and uses it as internal state. Overrides - * all changes made by the ParserStateBuilder. + * Copies the State instance and uses it as internal state. Overrides + * all changes made by the StateBuilder. * * @param state is the state that should be copied. - * @return a reference at this ParserStateBuilder instance for method + * @return a reference at this StateBuilder instance for method * chaining. */ - ParserStateBuilder ©(const ParserState &state); + StateBuilder ©(const State &state); /** * Sets the possible parent states to the single given parent element. * - * @param parent is a pointer at the parent ParserState instance that should + * @param parent is a pointer at the parent State instance that should * be the possible parent state. - * @return a reference at this ParserStateBuilder instance for method + * @return a reference at this StateBuilder instance for method * chaining. */ - ParserStateBuilder &parent(const ParserState *parent); + StateBuilder &parent(const State *parent); /** - * Sets the ParserState instances in the given ParserStateSet as the list of + * Sets the State instances in the given StateSet as the list of * supported parent states. * - * @param parents is a set of pointers at ParserState instances that should + * @param parents is a set of pointers at State instances that should * be the possible parent states. - * @return a reference at this ParserStateBuilder instance for method + * @return a reference at this StateBuilder instance for method * chaining. */ - ParserStateBuilder &parents(const ParserStateSet &parents); + StateBuilder &parents(const StateSet &parents); /** * Sets the arguments that should be passed to the parser state handler to * those given as argument. * * @param arguments is the Arguments instance describing the Arguments that - * should be parsed to a Handler for this ParserState. - * @return a reference at this ParserStateBuilder instance for method + * should be parsed to a Handler for this State. + * @return a reference at this StateBuilder instance for method * chaining. */ - ParserStateBuilder &arguments(const Arguments &arguments); + StateBuilder &arguments(const Arguments &arguments); /** * Sets the Node types this state may produce to the given Rtti descriptor. * * @param type is the Rtti descriptor of the Type that may be produced by * this state. - * @return a reference at this ParserStateBuilder instance for method + * @return a reference at this StateBuilder instance for method * chaining. */ - ParserStateBuilder &createdNodeType(const Rtti *type); + StateBuilder &createdNodeType(const Rtti *type); /** * Sets the Node types this state may produce to the given Rtti descriptors. * * @param types is a set of Rtti descriptors of the Types that may be * produced by this state. - * @return a reference at this ParserStateBuilder instance for method + * @return a reference at this StateBuilder instance for method * chaining. */ - ParserStateBuilder &createdNodeTypes(const RttiSet &types); + StateBuilder &createdNodeTypes(const RttiSet &types); /** * Sets the constructor for the element handler. The constructor creates a @@ -191,31 +203,42 @@ public: * * @param elementHandler is the HandlerConstructor that should create a * new Handler instance. - * @return a reference at this ParserStateBuilder instance for method + * @return a reference at this StateBuilder instance for method * chaining. */ - ParserStateBuilder &elementHandler(HandlerConstructor elementHandler); + StateBuilder &elementHandler(HandlerConstructor elementHandler); /** - * Returns a reference at the internal ParserState instance that was built - * using the ParserStateBuilder. + * Sets the state of the "supportsAnnotations" flags (default value is + * false) * - * @return the built ParserState. + * @param supportsAnnotations should be set to true, if annotations are + * supported for the handlers associated with this document. + * @return a reference at this StateBuilder instance for method + * chaining. */ - const ParserState &build() const; + StateBuilder &supportsAnnotations(bool supportsAnnotations); + + /** + * Returns a reference at the internal State instance that was built + * using the StateBuilder. + * + * @return the built State. + */ + const State &build() const; }; /** - * Class used to deduce the ParserState a Parser is currently in based on the + * Class used to deduce the State a Parser is currently in based on the * types of the Nodes that currently are on the ParserStack. Uses dynamic * programming in order to solve this problem. */ -class ParserStateDeductor { +class StateDeductor { public: /** * Type containing the dynamic programming table. */ - using Table = std::vector<std::unordered_map<const ParserState *, bool>>; + using Table = std::vector<std::unordered_map<const State *, bool>>; private: /** @@ -231,7 +254,7 @@ private: /** * List of states that should be checked for being active. */ - const std::vector<const ParserState *> states; + const std::vector<const State *> states; /** * Used internally to check whether the given parser stack s may have been @@ -239,20 +262,20 @@ private: * * @param d is the signature element. * @param s is the parser state. - * @return true if the the given ParserState may have been active. + * @return true if the the given State may have been active. */ - bool isActive(size_t d, const ParserState *s); + bool isActive(size_t d, const State *s); public: /** - * Constructor of the ParserStateDeductor class. + * Constructor of the StateDeductor class. * * @param signature a Node type signature describing the types of the nodes * which currently reside on e.g. the ParserScope stack. * @param states is a list of states that should be checked. */ - ParserStateDeductor(std::vector<const Rtti *> signature, - std::vector<const ParserState *> states); + StateDeductor(std::vector<const Rtti *> signature, + std::vector<const State *> states); /** * Selects all active states from the given states. Only considers those @@ -260,23 +283,24 @@ public: * * @return a list of states that may actually have been active. */ - std::vector<const ParserState *> deduce(); + std::vector<const State *> deduce(); }; /** - * The ParserStates namespace contains all the global state constants used + * The States namespace contains all the global state constants used * in the ParserStack class. */ -namespace ParserStates { +namespace States { /** * State representing all states. */ -extern const ParserState All; +extern const State All; /** * State representing the initial state. */ -extern const ParserState None; +extern const State None; +} } } diff --git a/src/core/parser/utils/TokenTrie.cpp b/src/core/parser/utils/TokenTrie.cpp new file mode 100644 index 0000000..4a0430b --- /dev/null +++ b/src/core/parser/utils/TokenTrie.cpp @@ -0,0 +1,119 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "TokenTrie.hpp" + +namespace ousia { + +/* Class DynamicTokenTree::Node */ + +TokenTrie::Node::Node() : type(EmptyToken) {} + +/* Class DynamicTokenTree */ + +bool TokenTrie::registerToken(const std::string &token, + TokenTypeId type) noexcept +{ + // Abort if the token is empty -- this would taint the root node + if (token.empty()) { + return false; + } + + // Iterate over each character in the given string and insert them as + // (new) nodes + Node *node = &root; + for (size_t i = 0; i < token.size(); i++) { + // Insert a new node if this one does not exist + const char c = token[i]; + auto it = node->children.find(c); + if (it == node->children.end()) { + it = node->children.emplace(c, std::make_shared<Node>()).first; + } + node = it->second.get(); + } + + // If the resulting node already has a type set, we're screwed. + if (node->type != EmptyToken) { + return false; + } + + // Otherwise just set the type to the given type. + node->type = type; + return true; +} + +bool TokenTrie::unregisterToken(const std::string &token) noexcept +{ + // We cannot remove empty tokens as we need to access the fist character + // upfront + if (token.empty()) { + return false; + } + + // First pass -- search the node in the path that can be deleted + Node *subtreeRoot = &root; + char subtreeKey = token[0]; + Node *node = &root; + for (size_t i = 0; i < token.size(); i++) { + // Go to the next node, abort if the tree ends unexpectedly + auto it = node->children.find(token[i]); + if (it == node->children.end()) { + return false; + } + + // Reset the subtree handler if this node has another type + node = it->second.get(); + if ((node->type != EmptyToken || node->children.size() > 1) && + (i + 1 != token.size())) { + subtreeRoot = node; + subtreeKey = token[i + 1]; + } + } + + // If the node type is already EmptyToken, we cannot do anything here + if (node->type == EmptyToken) { + return false; + } + + // If the target node has children, we cannot delete the subtree. Set the + // type to EmptyToken instead + if (!node->children.empty()) { + node->type = EmptyToken; + return true; + } + + // If we end up here, we can safely delete the complete subtree + subtreeRoot->children.erase(subtreeKey); + return true; +} + +TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept +{ + Node const *node = &root; + for (size_t i = 0; i < token.size(); i++) { + const char c = token[i]; + auto it = node->children.find(c); + if (it == node->children.end()) { + return EmptyToken; + } + node = it->second.get(); + } + return node->type; +} +} + diff --git a/src/core/parser/utils/TokenTrie.hpp b/src/core/parser/utils/TokenTrie.hpp new file mode 100644 index 0000000..36c2ffa --- /dev/null +++ b/src/core/parser/utils/TokenTrie.hpp @@ -0,0 +1,150 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file TokenTrie.hpp + * + * Class representing a token trie that can be updated dynamically. + * + * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de) + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_TOKEN_TRIE_HPP_ +#define _OUSIA_TOKEN_TRIE_HPP_ + +#include <cstdint> +#include <memory> +#include <limits> +#include <unordered_map> + +namespace ousia { + +/** + * The TokenTypeId is used to give each token type a unique id. + */ +using TokenTypeId = uint32_t; + +/** + * Token which is not a token. + */ +constexpr TokenTypeId EmptyToken = std::numeric_limits<TokenTypeId>::max(); + +/** + * Token which represents a text token. + */ +constexpr TokenTypeId TextToken = std::numeric_limits<TokenTypeId>::max() - 1; + +/** + * The Tokenizer internally uses a TokenTrie to be efficiently able to identify + * the longest consecutive token in the text. This is equivalent to a prefix + * trie. + * + * A token trie is a construct that structures all special tokens a Tokenizer + * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and + * three. Then the token tree would look like this: + * + * \code{*.txt} + * ~ (0) + * / \ + * a (2) b (0) + * | | + * a (0) a (0) + * | | + * b (1) c (0) + * \endcode + * + * Where the number indicates the corresponding token descriptor identifier. + */ +class TokenTrie { +public: + /** + * Structure used to build the node tree. + */ + struct Node { + /** + * Type used for the child map. + */ + using ChildMap = std::unordered_map<char, std::shared_ptr<Node>>; + + /** + * Map from single characters at the corresponding child nodes. + */ + ChildMap children; + + /** + * Reference at the corresponding token descriptor. Set to nullptr if + * no token is attached to this node. + */ + TokenTypeId type; + + /** + * Default constructor, initializes the descriptor with nullptr. + */ + Node(); + }; + +private: + /** + * Root node of the internal token tree. + */ + Node root; + +public: + /** + * Registers a token containing the given string. Returns false if the + * token already exists, true otherwise. + * + * @param token is the character sequence that should be registered as + * token. + * @param type is the descriptor that should be set for this token. + * @return true if the operation is successful, false otherwise. + */ + bool registerToken(const std::string &token, TokenTypeId type) noexcept; + + /** + * Unregisters the token from the token tree. Returns true if the token was + * unregistered successfully, false otherwise. + * + * @param token is the character sequence that should be unregistered. + * @return true if the operation was successful, false otherwise. + */ + bool unregisterToken(const std::string &token) noexcept; + + /** + * Returns true, if the given token exists within the TokenTree. This + * function is mostly thought for debugging and unit testing. + * + * @param token is the character sequence that should be searched. + * @return the attached token descriptor or nullptr if the given token is + * not found. + */ + TokenTypeId hasToken(const std::string &token) const noexcept; + + /** + * Returns a reference at the root node to be used for traversing the token + * tree. + * + * @return a reference at the root node. + */ + const Node *getRoot() const noexcept { return &root; } +}; +} + +#endif /* _OUSIA_TOKEN_TRIE_HPP_ */ + diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp new file mode 100644 index 0000000..3c8177d --- /dev/null +++ b/src/core/parser/utils/Tokenizer.cpp @@ -0,0 +1,381 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <memory> +#include <vector> + +#include <core/common/CharReader.hpp> +#include <core/common/Exceptions.hpp> +#include <core/common/Utils.hpp> +#include <core/common/WhitespaceHandler.hpp> + +#include "Tokenizer.hpp" + +namespace ousia { + +namespace { + +/* Internal class TokenMatch */ + +/** + * Contains information about a matching token. + */ +struct TokenMatch { + /** + * Token that was matched. + */ + Token token; + + /** + * Current length of the data within the text handler. The text buffer needs + * to be trimmed to this length if this token matches. + */ + size_t textLength; + + /** + * End location of the current text handler. This location needs to be used + * for the text token that is emitted before the actual token. + */ + size_t textEnd; + + /** + * Constructor of the TokenMatch class. + */ + TokenMatch() : textLength(0), textEnd(0) {} + + /** + * Returns true if this TokenMatch instance actually represents a match. + */ + bool hasMatch() { return token.type != EmptyToken; } +}; + +/* Internal class TokenLookup */ + +/** + * The TokenLookup class is used to represent a thread in a running token + * lookup. + */ +class TokenLookup { +private: + /** + * Current node within the token trie. + */ + TokenTrie::Node const *node; + + /** + * Start offset within the source file. + */ + size_t start; + + /** + * Current length of the data within the text handler. The text buffer needs + * to be trimmed to this length if this token matches. + */ + size_t textLength; + + /** + * End location of the current text handler. This location needs to be used + * for the text token that is emitted before the actual token. + */ + size_t textEnd; + +public: + /** + * Constructor of the TokenLookup class. + * + * @param node is the current node. + * @param start is the start position. + * @param textLength is the text buffer length of the previous text token. + * @param textEnd is the current end location of the previous text token. + */ + TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength, + size_t textEnd) + : node(node), start(start), textLength(textLength), textEnd(textEnd) + { + } + + /** + * Tries to extend the current path in the token trie with the given + * character. If a complete token is matched, stores this match in the + * tokens list (in case it is longer than any previous token). + * + * @param c is the character that should be appended to the current prefix. + * @param lookups is a list to which new TokeLookup instances are added -- + * which could potentially be expanded in the next iteration. + * @param match is the Token instance to which the matching token + * should be written. + * @param tokens is a reference at the internal token list of the + * Tokenizer. + * @param end is the end byte offset of the current character. + * @param sourceId is the source if of this file. + */ + void advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match, + const std::vector<std::string> &tokens, SourceOffset end, + SourceId sourceId) + { + // Check whether we can continue the current token path with the given + // character without visiting an already visited node + auto it = node->children.find(c); + if (it == node->children.end()) { + return; + } + + // Check whether the new node represents a complete token a whether it + // is longer than the current token. If yes, replace the current token. + node = it->second.get(); + if (node->type != EmptyToken) { + const std::string &str = tokens[node->type]; + size_t len = str.size(); + if (len > match.token.content.size()) { + match.token = + Token{node->type, str, {sourceId, start, end}}; + match.textLength = textLength; + match.textEnd = textEnd; + } + } + + // If this state can possibly be advanced, store it in the states list. + if (!node->children.empty()) { + lookups.emplace_back(*this); + } + } +}; + +/** + * Transforms the given token into a text token containing the extracted + * text. + * + * @param handler is the WhitespaceHandler containing the collected data. + * @param token is the output token to which the text should be written. + * @param sourceId is the source id of the underlying file. + */ +static void buildTextToken(const WhitespaceHandler &handler, TokenMatch &match, + SourceId sourceId) +{ + if (match.hasMatch()) { + match.token.content = + std::string{handler.textBuf.data(), match.textLength}; + match.token.location = + SourceLocation{sourceId, handler.textStart, match.textEnd}; + } else { + match.token.content = handler.toString(); + match.token.location = + SourceLocation{sourceId, handler.textStart, handler.textEnd}; + } + match.token.type = TextToken; +} +} + +/* Class Tokenizer */ + +Tokenizer::Tokenizer(WhitespaceMode whitespaceMode) + : whitespaceMode(whitespaceMode), nextTokenTypeId(0) +{ +} + +template <typename TextHandler, bool read> +bool Tokenizer::next(CharReader &reader, Token &token) +{ + // If we're in the read mode, reset the char reader peek position to the + // current read position + if (read) { + reader.resetPeek(); + } + + // Prepare the lookups in the token trie + const TokenTrie::Node *root = trie.getRoot(); + TokenMatch match; + std::vector<TokenLookup> lookups; + std::vector<TokenLookup> nextLookups; + + // Instantiate the text handler + TextHandler textHandler; + + // Peek characters from the reader and try to advance the current token tree + // cursor + char c; + size_t charStart = reader.getPeekOffset(); + const SourceId sourceId = reader.getSourceId(); + while (reader.peek(c)) { + const size_t charEnd = reader.getPeekOffset(); + const size_t textLength = textHandler.textBuf.size(); + const size_t textEnd = textHandler.textEnd; + + // If we do not have a match yet, start a new lookup from the root + if (!match.hasMatch()) { + TokenLookup{root, charStart, textLength, textEnd}.advance( + c, nextLookups, match, tokens, charEnd, sourceId); + } + + // Try to advance all other lookups with the new character + for (TokenLookup &lookup : lookups) { + lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId); + } + + // We have found a token and there are no more states to advance or the + // text handler has found something -- abort to return the new token + if (match.hasMatch()) { + if ((nextLookups.empty() || textHandler.hasText())) { + break; + } + } else { + // Record all incomming characters + textHandler.append(c, charStart, charEnd); + } + + // Swap the lookups and the nextLookups list + lookups = std::move(nextLookups); + nextLookups.clear(); + + // Advance the offset + charStart = charEnd; + } + + // If we found text, emit that text + if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) { + buildTextToken(textHandler, match, sourceId); + } + + // Move the read/peek cursor to the end of the token, abort if an error + // happens while doing so + if (match.hasMatch()) { + // Make sure we have a valid location + if (match.token.location.getEnd() == InvalidSourceOffset) { + throw OusiaException{"Token end position offset out of range"}; + } + + // Seek to the end of the current token + const size_t end = match.token.location.getEnd(); + if (read) { + reader.seek(end); + } else { + reader.seekPeekCursor(end); + } + token = match.token; + } else { + token = Token{}; + } + return match.hasMatch(); +} + +bool Tokenizer::read(CharReader &reader, Token &token) +{ + switch (whitespaceMode) { + case WhitespaceMode::PRESERVE: + return next<PreservingWhitespaceHandler, true>(reader, token); + case WhitespaceMode::TRIM: + return next<TrimmingWhitespaceHandler, true>(reader, token); + case WhitespaceMode::COLLAPSE: + return next<CollapsingWhitespaceHandler, true>(reader, token); + } + return false; +} + +bool Tokenizer::peek(CharReader &reader, Token &token) +{ + switch (whitespaceMode) { + case WhitespaceMode::PRESERVE: + return next<PreservingWhitespaceHandler, false>(reader, token); + case WhitespaceMode::TRIM: + return next<TrimmingWhitespaceHandler, false>(reader, token); + case WhitespaceMode::COLLAPSE: + return next<CollapsingWhitespaceHandler, false>(reader, token); + } + return false; +} + +TokenTypeId Tokenizer::registerToken(const std::string &token) +{ + // Abort if an empty token should be registered + if (token.empty()) { + return EmptyToken; + } + + // Search for a new slot in the tokens list + TokenTypeId type = EmptyToken; + for (size_t i = nextTokenTypeId; i < tokens.size(); i++) { + if (tokens[i].empty()) { + tokens[i] = token; + type = i; + break; + } + } + + // No existing slot was found, add a new one -- make sure we do not + // override the special token type handles + if (type == EmptyToken) { + type = tokens.size(); + if (type == TextToken || type == EmptyToken) { + throw OusiaException{"Token type ids depleted!"}; + } + tokens.emplace_back(token); + } + nextTokenTypeId = type + 1; + + // Try to register the token in the trie -- if this fails, remove it + // from the tokens list + if (!trie.registerToken(token, type)) { + tokens[type] = std::string{}; + nextTokenTypeId = type; + return EmptyToken; + } + return type; +} + +bool Tokenizer::unregisterToken(TokenTypeId type) +{ + // Unregister the token from the trie, abort if an invalid type is given + if (type < tokens.size() && trie.unregisterToken(tokens[type])) { + tokens[type] = std::string{}; + nextTokenTypeId = type; + return true; + } + return false; +} + +std::string Tokenizer::getTokenString(TokenTypeId type) +{ + if (type < tokens.size()) { + return tokens[type]; + } + return std::string{}; +} + +void Tokenizer::setWhitespaceMode(WhitespaceMode mode) +{ + whitespaceMode = mode; +} + +WhitespaceMode Tokenizer::getWhitespaceMode() { return whitespaceMode; } + +/* Explicitly instantiate all possible instantiations of the "next" member + function */ +template bool Tokenizer::next<PreservingWhitespaceHandler, false>( + CharReader &reader, Token &token); +template bool Tokenizer::next<TrimmingWhitespaceHandler, false>( + CharReader &reader, Token &token); +template bool Tokenizer::next<CollapsingWhitespaceHandler, false>( + CharReader &reader, Token &token); +template bool Tokenizer::next<PreservingWhitespaceHandler, true>( + CharReader &reader, Token &token); +template bool Tokenizer::next<TrimmingWhitespaceHandler, true>( + CharReader &reader, Token &token); +template bool Tokenizer::next<CollapsingWhitespaceHandler, true>( + CharReader &reader, Token &token); +} + diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp new file mode 100644 index 0000000..6b4e116 --- /dev/null +++ b/src/core/parser/utils/Tokenizer.hpp @@ -0,0 +1,231 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file Tokenizer.hpp + * + * Tokenizer that can be reconfigured at runtime used for parsing the plain + * text format. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_ +#define _OUSIA_DYNAMIC_TOKENIZER_HPP_ + +#include <set> +#include <string> +#include <vector> + +#include <core/common/Location.hpp> +#include <core/common/Whitespace.hpp> + +#include "TokenTrie.hpp" + +namespace ousia { + +// Forward declarations +class CharReader; + +/** + * The Token structure describes a token discovered by the Tokenizer. + */ +struct Token { + /** + * Id of the type of this token. + */ + TokenTypeId type; + + /** + * String that was matched. + */ + std::string content; + + /** + * Location from which the string was extracted. + */ + SourceLocation location; + + /** + * Default constructor. + */ + Token() : type(EmptyToken) {} + + /** + * Constructor of the Token struct. + * + * @param id represents the token type. + * @param content is the string content that has been extracted. + * @param location is the location of the extracted string content in the + * source file. + */ + Token(TokenTypeId type, const std::string &content, + SourceLocation location) + : type(type), content(content), location(location) + { + } + + /** + * Constructor of the Token struct, only initializes the token type + * + * @param type is the id corresponding to the type of the token. + */ + Token(TokenTypeId type) : type(type) {} + + /** + * The getLocation function allows the tokens to be directly passed as + * parameter to Logger or LoggableException instances. + * + * @return a reference at the location field + */ + const SourceLocation &getLocation() const { return location; } +}; + +/** + * The Tokenizer is used to extract tokens and chunks of text from a + * CharReader. It allows to register and unregister tokens while parsing and + * to modify the handling of whitespace characters. Note that the + * Tokenizer always tries to extract the longest possible token from the + * tokenizer. + */ +class Tokenizer { +private: + /** + * Internally used token trie. This object holds all registered tokens. + */ + TokenTrie trie; + + /** + * Flag defining whether whitespaces should be preserved or not. + */ + WhitespaceMode whitespaceMode; + + /** + * Vector containing all registered token types. + */ + std::vector<std::string> tokens; + + /** + * Next index in the tokens list where to search for a new token id. + */ + size_t nextTokenTypeId; + + /** + * Templated function used internally to read the current token. The + * function is templated in order to force code generation for all six + * combiations of whitespace modes and reading/peeking. + * + * @tparam TextHandler is the type to be used for the textHandler instance. + * @tparam read specifies whether the function should start from and advance + * the read pointer of the char reader. + * @param reader is the CharReader instance from which the data should be + * read. + * @param token is the token structure into which the token information + * should be written. + * @return false if the end of the stream has been reached, true otherwise. + */ + template <typename TextHandler, bool read> + bool next(CharReader &reader, Token &token); + +public: + /** + * Constructor of the Tokenizer class. + * + * @param whitespaceMode specifies how whitespace should be handled. + */ + Tokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); + + /** + * Registers the given string as a token. Returns a const pointer at a + * TokenDescriptor that will be used to reference the newly created token. + * + * @param token is the token string that should be registered. + * @return a unique identifier for the registered token or EmptyToken if + * an error occured. + */ + TokenTypeId registerToken(const std::string &token); + + /** + * Unregisters the token belonging to the given TokenTypeId. + * + * @param type is the token type that should be unregistered. The + *TokenTypeId + * must have been returned by registerToken. + * @return true if the operation was successful, false otherwise (e.g. + * because the given TokenDescriptor was already unregistered). + */ + bool unregisterToken(TokenTypeId type); + + /** + * Returns the token that was registered under the given TokenTypeId id or + *an + * empty string if an invalid TokenTypeId id is given. + * + * @param type is the TokenTypeId id for which the corresponding token + *string + * should be returned. + * @return the registered token string or an empty string if the given type + * was invalid. + */ + std::string getTokenString(TokenTypeId type); + + /** + * Sets the whitespace mode. + * + * @param whitespaceMode defines how whitespace should be treated in text + * tokens. + */ + void setWhitespaceMode(WhitespaceMode mode); + + /** + * Returns the current value of the whitespace mode. + * + * @return the whitespace mode. + */ + WhitespaceMode getWhitespaceMode(); + + /** + * Reads a new token from the CharReader and stores it in the given + * Token instance. + * + * @param reader is the CharReader instance from which the data should be + * read. + * @param token is a reference at the token instance into which the Token + * information should be written. + * @return true if a token could be read, false if the end of the stream + * has been reached. + */ + bool read(CharReader &reader, Token &token); + + /** + * The peek method does not advance the read position of the char reader, + * but reads the next token from the current char reader peek position. + * + * @param reader is the CharReader instance from which the data should be + * read. + * @param token is a reference at the token instance into which the Token + * information should be written. + * @return true if a token could be read, false if the end of the stream + * has been reached. + */ + bool peek(CharReader &reader, Token &token); +}; +} + +#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ + |