From 5a67fc7d682ddba6a862aacf616d02cd20b727eb Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Tue, 24 Feb 2015 02:13:46 +0100 Subject: start of branch, commit log will be rewritten --- src/core/parser/stack/DocumentHandler.cpp | 24 ++- src/core/parser/stack/DocumentHandler.hpp | 4 +- src/core/parser/stack/Handler.cpp | 25 ++- src/core/parser/stack/Handler.hpp | 74 +++++--- src/core/parser/stack/Stack.cpp | 55 ++++-- src/core/parser/stack/Stack.hpp | 18 +- src/core/parser/utils/SourceOffsetVector.hpp | 28 ++- src/core/parser/utils/Token.cpp | 24 --- src/core/parser/utils/Token.hpp | 142 -------------- src/core/parser/utils/TokenTrie.cpp | 16 +- src/core/parser/utils/TokenTrie.hpp | 11 +- src/core/parser/utils/TokenizedData.cpp | 133 +++++++++++-- src/core/parser/utils/TokenizedData.hpp | 214 +++++++++++++++++---- src/core/parser/utils/Tokenizer.cpp | 271 ++++++++++++--------------- src/core/parser/utils/Tokenizer.hpp | 140 ++++++++------ 15 files changed, 680 insertions(+), 499 deletions(-) delete mode 100644 src/core/parser/utils/Token.cpp delete mode 100644 src/core/parser/utils/Token.hpp diff --git a/src/core/parser/stack/DocumentHandler.cpp b/src/core/parser/stack/DocumentHandler.cpp index bb04bd3..d44176a 100644 --- a/src/core/parser/stack/DocumentHandler.cpp +++ b/src/core/parser/stack/DocumentHandler.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -372,8 +373,15 @@ bool DocumentChildHandler::convertData(Handle field, return valid && scope().resolveValue(data, type, logger); } -bool DocumentChildHandler::data(Variant &data) +bool DocumentChildHandler::data(TokenizedData &data) { + // TODO: Handle this correctly + Variant text = data.text(WhitespaceMode::TRIM); + if (text == nullptr) { + // For now, accept "no data" as success + return true; + } + // We're past the region in which explicit fields can be defined in the // parent structure element scope().setFlag(ParserFlag::POST_EXPLICIT_FIELDS, true); @@ -393,11 +401,11 @@ bool DocumentChildHandler::data(Variant &data) // If it is a primitive field directly, try to parse the content. if (field->isPrimitive()) { // Add it as primitive content. - if (!convertData(field, data, logger())) { + if (!convertData(field, text, logger())) { return false; } - parent->createChildDocumentPrimitive(data, fieldIdx); + parent->createChildDocumentPrimitive(text, fieldIdx); return true; } @@ -411,7 +419,7 @@ bool DocumentChildHandler::data(Variant &data) for (auto primitiveField : defaultFields) { // Then try to parse the content using the type specification.
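+ // Fork the logger for this attempt: conversion errors are collected in + // the fork and can be discarded if another candidate field matches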
forks.emplace_back(logger().fork()); - if (!convertData(primitiveField, data, forks.back())) { + if (!convertData(primitiveField, text, forks.back())) { continue; } @@ -424,7 +432,7 @@ bool DocumentChildHandler::data(Variant &data) createPath(fieldIdx, path, parent); // Then create the primitive element - parent->createChildDocumentPrimitive(data); + parent->createChildDocumentPrimitive(text); return true; } @@ -434,10 +442,10 @@ bool DocumentChildHandler::data(Variant &data) if (defaultFields.empty()) { logger().error("Got data, but structure \"" + name() + "\" does not have any primitive field", - data); + text); } else { logger().error("Could not read data with any of the possible fields:", - data); + text); size_t f = 0; for (auto field : defaultFields) { logger().note(std::string("Field ") + @@ -471,4 +479,4 @@ namespace RttiTypes { const Rtti DocumentField = RttiBuilder( "DocumentField").parent(&Node); } -} \ No newline at end of file +} diff --git a/src/core/parser/stack/DocumentHandler.hpp b/src/core/parser/stack/DocumentHandler.hpp index 862081c..dda7d8b 100644 --- a/src/core/parser/stack/DocumentHandler.hpp +++ b/src/core/parser/stack/DocumentHandler.hpp @@ -167,7 +167,7 @@ public: bool start(Variant::mapType &args) override; void end() override; - bool data(Variant &data) override; + bool data(TokenizedData &data) override; bool fieldStart(bool &isDefault, size_t fieldIdx) override; @@ -213,4 +213,4 @@ extern const Rtti DocumentField; } } -#endif /* _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ */ \ No newline at end of file +#endif /* _OUSIA_PARSER_STACK_DOCUMENT_HANDLER_HPP_ */ diff --git a/src/core/parser/stack/Handler.cpp b/src/core/parser/stack/Handler.cpp index bf5d4ea..3d413e8 100644 --- a/src/core/parser/stack/Handler.cpp +++ b/src/core/parser/stack/Handler.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include "Callbacks.hpp" @@ -130,7 +131,7 @@ bool EmptyHandler::annotationEnd(const Variant &className, return true; } -bool EmptyHandler::data(Variant &data) +bool EmptyHandler::data(TokenizedData &data) { // Support any data return true; @@ -184,10 +185,13 @@ bool StaticHandler::annotationEnd(const Variant &className, return false; } -bool StaticHandler::data(Variant &data) +bool StaticHandler::data(TokenizedData &data) { - logger().error("Did not expect any data here", data); - return false; + if (data.text(WhitespaceMode::TRIM) != nullptr) { + logger().error("Did not expect any data here", data); + return false; + } + return true; } /* Class StaticFieldHandler */ @@ -227,12 +231,19 @@ void StaticFieldHandler::end() } } -bool StaticFieldHandler::data(Variant &data) +bool StaticFieldHandler::data(TokenizedData &data) { + Variant text = data.text(WhitespaceMode::TRIM); + if (text == nullptr) { + // Providing no data here is ok as long as the "doHandle" callback + // function has already been called + return handled; + } + // Call the doHandle function if this has not been done before if (!handled) { handled = true; - doHandle(data, args); + doHandle(text, args); return true; } @@ -240,7 +251,7 @@ bool StaticFieldHandler::data(Variant &data) logger().error( std::string("Found data, but the corresponding argument \"") + argName + std::string("\" was already specified"), - data); + text); // Print the location at which the attribute was originally specified auto it = args.find(argName); diff --git a/src/core/parser/stack/Handler.hpp b/src/core/parser/stack/Handler.hpp index 7cda7a4..929466d 100644 --- a/src/core/parser/stack/Handler.hpp +++ 
b/src/core/parser/stack/Handler.hpp @@ -31,6 +31,7 @@ namespace ousia { class ParserScope; class ParserContext; class Logger; +class TokenizedData; namespace parser_stack { @@ -158,40 +159,63 @@ protected: */ const std::string &name() const; -public: - /** - * Virtual destructor. - */ - virtual ~Handler(); - /** * Calls the corresponding function in the Callbacks instance. Sets the * whitespace mode that specifies how string data should be processed. The * calls to this function are placed on a stack by the underlying Stack - * class. + * class. This function should be called from the "fieldStart" callback and + * the "start" callback. If no whitespace mode is pushed in the "start" + * method the whitespace mode "TRIM" is implicitly assumed. * * @param whitespaceMode specifies one of the three WhitespaceMode constants * PRESERVE, TRIM or COLLAPSE. */ - void setWhitespaceMode(WhitespaceMode whitespaceMode); + void pushWhitespaceMode(WhitespaceMode whitespaceMode); /** - * Calls the corresponding function in the Callbacks instance. - * Registers the given token as token that should be reported to the handler - * using the "token" function. - * - * @param token is the token string that should be reported. + * Pops a previously pushed whitespace mode. Calls to this function should + * occur in the "end" callback and the "fieldEnd" callback. This function + * can only undo pushes that were performed by the pushWhitespaceMode() + * method of the same handler. */ - void registerToken(const std::string &token); + void popWhitespaceMode(); /** - * Calls the corresponding function in the Callbacks instance. - * Unregisters the given token, it will no longer be reported to the handler - * using the "token" function. + * Calls the corresponding function in the Callbacks instance. Sets the + * list of tokens that should be reported to this handler instance via the + * "token" method. The calls to this function are placed on a stack by the + * underlying Stack class. This function should be called from the + * "fieldStart" callback and the "start" callback. * - * @param token is the token string that should be unregistered. + * @param tokens is a list of tokens that should be reported to this handler + * instance via the "token" method. */ - void unregisterToken(const std::string &token); + void pushTokens(const std::vector &tokens); + + /** + * Pops a previously pushed token list. Calls to this function should + * occur in the "end" callback and the "fieldEnd" callback. This function + * can only undo pushes that were performed by the pushTokens() method of + * the same handler. + */ + void popTokens(); + + /** + * Calls the corresponding function in the Callbacks instance. This method + * registers the given tokens as tokens that are generally available, tokens + * must be explicitly enabled using the "pushTokens" and "popTokens" methods. + * Tokens that have not been registered are not guaranteed to be reported, + * even though they are enabled. + */ + void registerTokens(const std::vector &tokens); + +public: + /** + * Virtual destructor. + */ + virtual ~Handler(); /** * Returns the command name for which the handler was created. @@ -299,11 +323,11 @@ public: * Handler instance. Should return true if the data could be handled, false * otherwise. * - * @param data is a string variant containing the character data and its - * location.
+ * @param data is an instance of TokenizedData containing the segmented + * character data and its location. * @return true if the data could be handled, false otherwise. */ - virtual bool data(Variant &data) = 0; + virtual bool data(TokenizedData &data) = 0; }; /** @@ -333,7 +357,7 @@ public: Variant::mapType &args) override; bool annotationEnd(const Variant &className, const Variant &elementName) override; - bool data(Variant &data) override; + bool data(TokenizedData &data) override; /** * Creates an instance of the EmptyHandler class. @@ -359,7 +383,7 @@ public: Variant::mapType &args) override; bool annotationEnd(const Variant &className, const Variant &elementName) override; - bool data(Variant &data) override; + bool data(TokenizedData &data) override; }; /** @@ -412,7 +436,7 @@ protected: public: bool start(Variant::mapType &args) override; void end() override; - bool data(Variant &data) override; + bool data(TokenizedData &data) override; }; } } diff --git a/src/core/parser/stack/Stack.cpp b/src/core/parser/stack/Stack.cpp index 5b67248..309c9a0 100644 --- a/src/core/parser/stack/Stack.cpp +++ b/src/core/parser/stack/Stack.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -413,16 +414,24 @@ void Stack::command(const Variant &name, const Variant::mapType &args) } } -void Stack::data(const Variant &data) +void Stack::data(TokenizedData data) { - // End handlers that already had a default field and are currently not - // active. - endOverdueHandlers(); + // TODO: Rewrite this function for token handling + // TODO: This loop needs to be refactored out + while (!data.atEnd()) { + // End handlers that already had a default field and are currently not + // active. + endOverdueHandlers(); - while (true) { - // Check whether there is any command the data can be sent to + const bool hasNonWhitespaceText = data.hasNonWhitespaceText(); + + // Check whether there is any command the data can be sent to -- if not, + // make sure the data actually is data if (stack.empty()) { - throw LoggableException("No command here to receive data.", data); + if (hasNonWhitespaceText) { + throw LoggableException("No command here to receive data.", data); + } + return; } // Fetch the current command handler information @@ -440,7 +449,10 @@ void Stack::data(const Variant &data) // If the "hadDefaultField" flag is set, we already issued an error // message if (!info.hadDefaultField) { - logger().error("Did not expect any data here", data); + if (hasNonWhitespaceText) { + logger().error("Did not expect any data here", data); + } + return; } } @@ -454,8 +466,16 @@ void Stack::data(const Variant &data) // Pass the data to the current Handler instance bool valid = false; try { - Variant dataCopy = data; - valid = info.handler->data(dataCopy); + // Create a fork of the TokenizedData and let the handler work + // on it + TokenizedData dataFork = data; + valid = info.handler->data(dataFork); + + // If the data was validly handled by the handler, commit the + // change + if (valid) { + data = dataFork; + } } catch (LoggableException ex) { loggerFork.log(ex); @@ -482,6 +502,19 @@ void Stack::data(const Variant &data) } } +void Stack::data(const Variant &stringData) +{ + // Fetch the SourceLocation of the given stringData variant + SourceLocation loc = stringData.getLocation(); + + // Create a TokenizedData instance and feed the given string data into it + TokenizedData tokenizedData(loc.getSourceId()); + tokenizedData.append(stringData.asString(), loc.getStart()); + + // Call the actual 
"data" method + data(tokenizedData); +} + void Stack::fieldStart(bool isDefault) { // Make sure the current handler stack is not empty @@ -584,4 +617,4 @@ void Stack::token(Variant token) // TODO } } -} \ No newline at end of file +} diff --git a/src/core/parser/stack/Stack.hpp b/src/core/parser/stack/Stack.hpp index b67ce82..cd29b28 100644 --- a/src/core/parser/stack/Stack.hpp +++ b/src/core/parser/stack/Stack.hpp @@ -44,6 +44,7 @@ namespace ousia { // Forward declarations class ParserContext; class Logger; +class TokenizedData; namespace parser_stack { @@ -292,13 +293,24 @@ public: void command(const Variant &name, const Variant::mapType &args); /** - * Function that shuold be called whenever character data is found in the + * Function that should be called whenever character data is found in the * input stream. May only be called if the currently is a command on the * stack. * - * @param data is a string variant containing the data that has been found. + * @param data is a TokenizedData instance containing the pre-segmented data + * that should be read. + */ + void data(TokenizedData data); + + /** + * Function that shuold be called whenever character data is found in the + * input stream. The given string variant is converted into a TokenizedData + * instance internally. + * + * @param stringData is a string variant containing the data that has been + * found. */ - void data(const Variant &data); + void data(const Variant &stringData); /** * Function that should be called whenever a new field starts. Fields of the diff --git a/src/core/parser/utils/SourceOffsetVector.hpp b/src/core/parser/utils/SourceOffsetVector.hpp index d15055a..aaebe7d 100644 --- a/src/core/parser/utils/SourceOffsetVector.hpp +++ b/src/core/parser/utils/SourceOffsetVector.hpp @@ -127,7 +127,7 @@ public: * read. * @return a pair containing start and end source offset. */ - std::pair loadOffset(size_t idx) + std::pair loadOffset(size_t idx) const { // Special treatment for the last character const size_t count = lens.size(); @@ -157,7 +157,31 @@ public: /** * Returns the number of characters for which offsets are stored. */ - size_t size() { return lens.size(); } + size_t size() const { return lens.size(); } + + /** + * Trims the length of the TokenizedData instance to the given length. + * Removes all token matches that lie within the trimmed region. + * + * @param length is the number of characters to which the TokenizedData + * instance should be trimmed. + */ + void trim(size_t length) { + if (length < size()) { + lens.resize(length); + offsets.resize((length >> LOG2_OFFSET_INTERVAL) + 1); + } + } + + /** + * Resets the SourceOffsetVector to the state it had when it was + * constructed. + */ + void clear() { + lens.clear(); + offsets.clear(); + lastEnd = 0; + } }; } diff --git a/src/core/parser/utils/Token.cpp b/src/core/parser/utils/Token.cpp deleted file mode 100644 index 8bcdbb5..0000000 --- a/src/core/parser/utils/Token.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include "Token.hpp" - -namespace ousia { -// Stub to make sure Tokens.hpp is valid -} - diff --git a/src/core/parser/utils/Token.hpp b/src/core/parser/utils/Token.hpp deleted file mode 100644 index f907450..0000000 --- a/src/core/parser/utils/Token.hpp +++ /dev/null @@ -1,142 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file Token.hpp - * - * Definition of the TokenId id and constants for some special tokens. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_TOKEN_HPP_ -#define _OUSIA_TOKEN_HPP_ - -#include -#include -#include - -#include - -namespace ousia { - -/** - * The TokenId is used to give each token id a unique id. - */ -using TokenId = uint32_t; - -/** - * Type used for storing token lengths. - */ -using TokenLength = uint16_t; - -/** - * Namespace containing constants for TokenId instances with special meaning. - */ -namespace Tokens { -/** - * Token which is not a token. - */ -constexpr TokenId Empty = std::numeric_limits::max(); - -/** - * Token which represents data (represented as TokenizedData). - */ -constexpr TokenId Data = std::numeric_limits::max() - 1; - -/** - * Token which represents a newline token. - */ -constexpr TokenId Newline = std::numeric_limits::max() - 2; - -/** - * Token which represents a paragraph token -- issued if two consecutive - * newlines occur with optionally any amout of whitespace between them. - */ -constexpr TokenId Paragraph = std::numeric_limits::max() - 3; - -/** - * Token which represents an indentation token -- issued if the indentation of - * this line is larget than the indentation of the previous line. - */ -constexpr TokenId Indentation = std::numeric_limits::max() - 4; - -/** - * Maximum token id to be used. Tokens allocated for users should not surpass - * this value. - */ -constexpr TokenId MaxTokenId = std::numeric_limits::max() - 255; -} - -/** - * The Token structure describes a token discovered by the Tokenizer or read - * from the TokenizedData struct. - */ -struct Token { - /** - * Id of the id of this token. - */ - TokenId id; - - /** - * String that was matched. - */ - std::string content; - - /** - * Location from which the string was extracted. - */ - SourceLocation location; - - /** - * Default constructor. - */ - Token() : id(Tokens::Empty) {} - - /** - * Constructor of the Token struct. - * - * @param id represents the token id. - * @param content is the string content that has been extracted. - * @param location is the location of the extracted string content in the - * source file. 
- */ - Token(TokenId id, const std::string &content, SourceLocation location) - : id(id), content(content), location(location) - { - } - - /** - * Constructor of the Token struct, only initializes the token id - * - * @param id is the id corresponding to the id of the token. - */ - Token(TokenId id) : id(id) {} - - /** - * The getLocation function allows the tokens to be directly passed as - * parameter to Logger or LoggableException instances. - * - * @return a reference at the location field - */ - const SourceLocation &getLocation() const { return location; } -}; -} - -#endif /* _OUSIA_TOKENS_HPP_ */ - diff --git a/src/core/parser/utils/TokenTrie.cpp b/src/core/parser/utils/TokenTrie.cpp index 80cc945..a45d3ff 100644 --- a/src/core/parser/utils/TokenTrie.cpp +++ b/src/core/parser/utils/TokenTrie.cpp @@ -22,12 +22,12 @@ namespace ousia { /* Class DynamicTokenTree::Node */ -TokenTrie::Node::Node() : type(Tokens::Empty) {} +TokenTrie::Node::Node() : id(Tokens::Empty) {} /* Class DynamicTokenTree */ bool TokenTrie::registerToken(const std::string &token, - TokenId type) noexcept + TokenId id) noexcept { // Abort if the token is empty -- this would taint the root node if (token.empty()) { @@ -48,12 +48,12 @@ bool TokenTrie::registerToken(const std::string &token, } // If the resulting node already has a type set, we're screwed. - if (node->type != Tokens::Empty) { + if (node->id != Tokens::Empty) { return false; } // Otherwise just set the type to the given type. - node->type = type; + node->id = id; return true; } @@ -78,7 +78,7 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept // Reset the subtree handler if this node has another type node = it->second.get(); - if ((node->type != Tokens::Empty || node->children.size() > 1) && + if ((node->id != Tokens::Empty || node->children.size() > 1) && (i + 1 != token.size())) { subtreeRoot = node; subtreeKey = token[i + 1]; @@ -86,14 +86,14 @@ bool TokenTrie::unregisterToken(const std::string &token) noexcept } // If the node type is already Tokens::Empty, we cannot do anything here - if (node->type == Tokens::Empty) { + if (node->id == Tokens::Empty) { return false; } // If the target node has children, we cannot delete the subtree. Set the // type to Tokens::Empty instead if (!node->children.empty()) { - node->type = Tokens::Empty; + node->id = Tokens::Empty; return true; } @@ -113,7 +113,7 @@ TokenId TokenTrie::hasToken(const std::string &token) const noexcept } node = it->second.get(); } - return node->type; + return node->id; } } diff --git a/src/core/parser/utils/TokenTrie.hpp b/src/core/parser/utils/TokenTrie.hpp index b2d1539..c470acc 100644 --- a/src/core/parser/utils/TokenTrie.hpp +++ b/src/core/parser/utils/TokenTrie.hpp @@ -33,7 +33,7 @@ #include #include -#include "Token.hpp" +#include namespace ousia { @@ -75,10 +75,9 @@ public: ChildMap children; /** - * Reference at the corresponding token descriptor. Set to nullptr if - * no token is attached to this node. + * Id of the token represented by this node. */ - TokenId type; + TokenId id; /** * Default constructor, initializes the descriptor with nullptr. @@ -99,10 +98,10 @@ public: * * @param token is the character sequence that should be registered as * token. - * @param type is the descriptor that should be set for this token. + * @param id is the descriptor that should be set for this token. * @return true if the operation is successful, false otherwise. 
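+ *
+ * A brief usage sketch (the token string and id are chosen for
+ * illustration):
+ * @code
+ * TokenTrie trie;
+ * trie.registerToken("<<", 42); // returns true
+ * trie.hasToken("<<");          // returns 42
+ * trie.hasToken("<");           // returns Tokens::Empty
+ * @endcode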
*/ - bool registerToken(const std::string &token, TokenId type) noexcept; + bool registerToken(const std::string &token, TokenId id) noexcept; /** * Unregisters the token from the token tree. Returns true if the token was diff --git a/src/core/parser/utils/TokenizedData.cpp b/src/core/parser/utils/TokenizedData.cpp index fc7bfaf..0ec56af 100644 --- a/src/core/parser/utils/TokenizedData.cpp +++ b/src/core/parser/utils/TokenizedData.cpp @@ -110,19 +110,19 @@ private: std::vector buf; /** - * Vector containing all token marks. + * Vector storing all the character offsets efficiently. */ - std::vector<TokenMark> marks; + SourceOffsetVector offsets; /** - * Vector storing all the character offsets efficiently. + * Vector containing all token marks. */ - SourceOffsetVector offsets; + mutable std::vector<TokenMark> marks; /** * Flag indicating whether the internal "marks" vector is sorted. */ - bool sorted; + mutable bool sorted; public: /** @@ -150,9 +150,12 @@ public: // Extend the text regions, interpolate the source position (this may // yield incorrect results) const size_t size = buf.size(); - for (SourceOffset offs = offsStart; offs < offsStart + data.size(); - offs++) { - offsets.storeOffset(offs, offs + 1); + for (size_t i = 0; i < data.size(); i++) { + if (offsStart != InvalidSourceOffset) { + offsets.storeOffset(offsStart + i, offsStart + i + 1); + } else { + offsets.storeOffset(InvalidSourceOffset, InvalidSourceOffset); + } } return size; @@ -213,7 +216,7 @@ public: * available. */ bool next(Token &token, WhitespaceMode mode, - const std::unordered_set<TokenId> &tokens, size_t &cursor) + const std::unordered_set<TokenId> &tokens, size_t &cursor) const { // Sort the "marks" vector if it has not been sorted yet. if (!sorted) { @@ -222,10 +225,11 @@ public: } // Fetch the next larger TokenMark instance, make sure the token is in - // the "enabled" list + // the "enabled" list and within the buffer range auto it = std::lower_bound(marks.begin(), marks.end(), TokenMark(cursor)); - while (it != marks.end() && tokens.count(it->id) == 0) { + while (it != marks.end() && (tokens.count(it->id) == 0 || + it->bufStart + it->len > buf.size())) { it++; } @@ -303,12 +307,59 @@ public: return false; } + /** + * Resets the TokenizedDataImpl instance to the state it had when it was + * constructed. + */ + void clear() + { + buf.clear(); + marks.clear(); + offsets.clear(); + sorted = true; + } + + /** + * Trims the length of the TokenizedDataImpl instance to the given length. + * + * @param length is the number of characters to which the TokenizedData + * instance should be trimmed. + */ + void trim(size_t length) + { + if (length < size()) { + buf.resize(length); + offsets.trim(length); + } + } + /** * Returns the current size of the internal buffer. * * @return the size of the internal character buffer. */ - size_t getSize() { return buf.size(); } + size_t size() const { return buf.size(); } + + /** + * Returns true if no data is in the data buffer. + * + * @return true if the "buf" instance has no data. + */ + bool empty() const { return buf.empty(); } + + /** + * Returns the current location of all data in the buffer. + * + * @return the location of the entire data represented by this instance.
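+ *
+ * A sketch of the intended semantics (offsets are illustrative):
+ * @code
+ * // assuming append("ab", 10) was called on an empty instance,
+ * // getLocation() spans the source range [10, 12)
+ * @endcode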
+ */ + SourceLocation getLocation() const + { + if (empty()) { + return SourceLocation{sourceId}; + } + return SourceLocation{sourceId, offsets.loadOffset(0).first, + offsets.loadOffset(size()).second}; + } }; /* Class TokenizedData */ @@ -335,7 +386,7 @@ size_t TokenizedData::append(char c, SourceOffset offsStart, void TokenizedData::mark(TokenId id, TokenLength len) { - impl->mark(id, impl->getSize() - len, len); + impl->mark(id, impl->size() - len, len); } void TokenizedData::mark(TokenId id, size_t bufStart, TokenLength len) @@ -343,23 +394,67 @@ void TokenizedData::mark(TokenId id, size_t bufStart, TokenLength len) impl->mark(id, bufStart, len); } -bool TokenizedData::next(Token &token, WhitespaceMode mode) +void TokenizedData::clear() { - return impl->next(token, mode, tokens, cursor); + impl->clear(); } -bool TokenizedData::text(Token &token, WhitespaceMode mode) +void TokenizedData::trim(size_t length) { impl->trim(length); } + +size_t TokenizedData::size() const { return impl->size(); } + +bool TokenizedData::empty() const { return impl->empty(); } + +SourceLocation TokenizedData::getLocation() const +{ + return impl->getLocation(); +} + +TokenizedDataReader TokenizedData::reader() const +{ + return TokenizedDataReader(impl, 0, 0); +} + +/* Class TokenizedDataReader */ + +TokenizedDataReaderFork TokenizedDataReader::fork() +{ + return TokenizedDataReaderFork(*this, impl, readCursor, peekCursor); +} + +bool TokenizedDataReader::atEnd() const { return readCursor >= impl->size(); } + +bool TokenizedDataReader::read(Token &token, const TokenSet &tokens, + WhitespaceMode mode) +{ + peekCursor = readCursor; + return impl->next(token, mode, tokens, readCursor); +} + +bool TokenizedDataReader::peek(Token &token, const TokenSet &tokens, + WhitespaceMode mode) +{ + return impl->next(token, mode, tokens, peekCursor); +} + +Variant TokenizedDataReader::text(WhitespaceMode mode) +{ // Copy the current read cursor position to not update the actual cursor // position if the operation was not successful size_t cursorCopy = readCursor; + Token token; if (!impl->next(token, mode, TokenSet{}, cursorCopy) || token.id != Tokens::Data) { - return false; + return Variant{nullptr}; } - // There is indeed a text token, update the internal cursor position + // There is indeed a text token, update the internal cursor position and + // return the token as variant. readCursor = cursorCopy; - return true; + Variant res = Variant::fromString(token.content); + res.setLocation(token.getLocation()); + return res; } } diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp index 38125c4..85b80ae 100644 --- a/src/core/parser/utils/TokenizedData.hpp +++ b/src/core/parser/utils/TokenizedData.hpp @@ -36,42 +36,29 @@ #include #include +#include #include - -#include "Token.hpp" +#include namespace ousia { // Forward declaration class TokenizedDataImpl; +class TokenizedDataReader; +class TokenizedDataReaderFork; /** * The TokenizedData class stores data extracted from a user defined document. - * As users are capable of defining their own tokens and these are only valid - * in certain scopes TokenizedData allows to divide the stored data into chunks - * separated by tokens. + * The data stored in TokenizedData is annotated with token marks and source + * offsets and can be accessed through TokenizedDataReader instances created + * via the reader() method.
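+ *
+ * A minimal usage sketch (identifiers and offsets are illustrative):
+ * @code
+ * TokenizedData data(sourceId);
+ * data.append("a.b", 0);   // three characters starting at offset 0
+ * data.mark(dotId, 1, 1);  // mark "." at buffer offset 1, length 1
+ * TokenizedDataReader reader = data.reader();
+ * @endcode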
*/ class TokenizedData { private: /** - * Shared pointer pointing at the internal data. This data is shared when - * copying TokenizedData instances, which corresponds to forking a - * TokenizedData instance. + * Shared pointer pointing at the internal data. This data is shared with + * all the TokenizedDataReader instances. */ std::shared_ptr<TokenizedDataImpl> impl; - /** - * Contains all currently enabled token ids. - */ - std::unordered_set<TokenId> tokens; - - /** - * Position from which the last element was read from the internal buffer. - * This information is not shared with the other instances of TokenizedData - * pointing at the same location. - */ - size_t cursor; - public: /** * Default constructor, creates a new instance of TokenizedData, sets the @@ -136,25 +123,121 @@ public: void mark(TokenId id, size_t bufStart, TokenLength len); /** - * Enables a single token id. Enabled tokens will no longer be returned as - * text. Instead, when querying for the next token, TokenizedData will - * return them as token and not as part of a Text token. + * Resets the TokenizedData instance to the state it had when it was + * constructed. + */ + void clear(); + + /** + * Trims the length of the TokenizedData instance to the given length. Note + * that this function does not remove any token matches for performance + * reasons, it merely renders them inaccessible. Appending new data after + * calling trim will make the token marks accessible again. Thus this method + * should be the last function called to modify the data buffer and the + * token marks. * - * @param id is the TokenId of the token that should be enabled. + * @param length is the number of characters to which the TokenizedData + * instance should be trimmed. */ - void enableToken(TokenId id) { tokens.insert(id); } + void trim(size_t length); /** - * Enables a set of token ids. Enabled tokens will no longer be returned as - * text. Instead, when querying for the next token, TokenizedData will - * return them as token and not as part of a Text token. + * Returns the number of characters currently represented by this + * TokenizedData instance. + */ + size_t size() const; + + /** + * Returns true if the TokenizedData instance is empty, false otherwise. * - * @param ids is the TokenId of the token that should be enabled. + * @return true if no data is stored inside the TokenizedData instance. */ - void enableToken(const std::unordered_set<TokenId> &ids) - { - tokens.insert(ids.begin(), ids.end()); - } + bool empty() const; + + /** + * Returns the location of the entire TokenizedData instance. + * + * @return the location of the entire data represented by this instance. + */ + SourceLocation getLocation() const; + + /** + * Returns a TokenizedDataReader instance that can be used to access the + * data. + * + * @return a new TokenizedDataReader instance pointing at the beginning of + * the internal buffer. + */ + TokenizedDataReader reader() const; +}; + +/** + * The TokenizedDataReader class provides sequential access to the tokens and + * text stored in a TokenizedData instance. New readers are created using + * TokenizedData::reader().
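+ *
+ * A reading-loop sketch (dotId is an illustrative token id):
+ * @code
+ * TokenizedDataReader reader = data.reader();
+ * Token token;
+ * while (reader.read(token, TokenSet{dotId})) {
+ *     // token.id is either dotId or Tokens::Data for plain text
+ * }
+ * @endcode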
+ */ +class TokenizedDataReader { +private: + friend TokenizedData; + + /** + * Shared pointer pointing at the internal data. This data is shared with + * all the TokenizedDataReader instances. + */ + std::shared_ptr<TokenizedDataImpl> impl; + + /** + * Position from which the last element was read from the internal buffer. + */ + size_t readCursor; + + /** + * Position from which the last element was peeked from the internal buffer. + */ + size_t peekCursor; + + /** + * Private constructor of TokenizedDataReader, taking a reference to the + * internal TokenizedDataImpl structure storing the data that is accessed by + * the reader. + * + * @param impl is the TokenizedDataImpl instance that holds the actual data. + * @param readCursor is the cursor position from which tokens and text are + * read. + * @param peekCursor is the cursor position from which tokens and text are + * peeked. + */ + TokenizedDataReader(std::shared_ptr<TokenizedDataImpl> impl, + size_t readCursor, size_t peekCursor); + +public: + /** + * Returns a new TokenizedDataReaderFork from which tokens and text can be + * read without advancing this reader instance. + */ + TokenizedDataReaderFork fork(); + + /** + * Returns true if this TokenizedDataReader instance is at the end. + * + * @return true if the end of the TokenizedData instance has been reached. + */ + bool atEnd() const; + + /** + * Stores the next token in the given token reference, returns true if the + * operation was successful, false if there are no more tokens. Advances the + * internal read cursor and resets the peek cursor. + * + * @param token is an output parameter into which the read token will be + * stored. The TokenId is set to Tokens::Empty if there are no more tokens. + * @param tokens is the set of token identifiers, representing the currently + * enabled tokens. + * @param mode is the whitespace mode that should be used when a text token + * is returned. + * @return true if the operation was successful and there is a next token, + * false if there are no more tokens. + */ + bool read(Token &token, const TokenSet &tokens = TokenSet{}, + WhitespaceMode mode = WhitespaceMode::COLLAPSE); /** * Stores the next token in the given token reference, returns true if the * operation was successful, false if there are no more tokens. * * @param token is an output parameter into which the read token will be * stored. The TokenId is set to Tokens::Empty if there are no more tokens. + * @param tokens is the set of token identifiers, representing the currently + * enabled tokens. * @param mode is the whitespace mode that should be used when a text token * is returned. * @return true if the operation was successful and there is a next token, * false if there are no more tokens. */ - bool next(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE); + bool peek(Token &token, const TokenSet &tokens = TokenSet{}, + WhitespaceMode mode = WhitespaceMode::COLLAPSE); + + /** + * Consumes the peeked tokens, the read cursor will now be at the position + * of the peek cursor. + */ + void consumePeek() { readCursor = peekCursor; } + + /** + * Resets the peek cursor to the position of the read cursor. + */ + void resetPeek() { peekCursor = readCursor; } /** * Stores the next text token in the given token reference, returns true if * the operation was successful, false if there are no more tokens. * * @param token is an output parameter into which the read token will be * stored. The TokenId is set to Tokens::Empty if there are no more tokens. * @param mode is the whitespace mode that should be used when a text token * is returned. - * @return true if the operation was successful and there is a next token, - * false if there are no more tokens. + * @return a string variant with the data if there is any data or a nullptr + * variant if there is no text. */ - bool text(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE); + Variant text(WhitespaceMode mode = WhitespaceMode::COLLAPSE); }; + +/** + * The TokenizedDataReaderFork class is created when forking a + * TokenizedDataReader; it allows to read ahead without advancing the parent + * reader until commit() is called.
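+ *
+ * A sketch of the fork/commit pattern:
+ * @code
+ * TokenizedDataReaderFork fork = reader.fork();
+ * Token token;
+ * if (fork.read(token)) {
+ *     fork.commit(); // propagate the progress to the parent reader
+ * }
+ * @endcode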
+ */ +class TokenizedDataReaderFork : public TokenizedDataReader { +private: + friend TokenizedDataReader; + + /** + * Reference pointing at the parent TokenizedDataReader to which changes may + * be committed. + */ + TokenizedDataReader &parent; + + /** + * Private constructor of TokenizedDataReaderFork, taking a reference to the + * internal TokenizedDataImpl structure storing the data that is accessed by + * the reader and a reference at the parent TokenizedDataReader. + * + * @param parent is the TokenizedDataReader instance to which the current + * read/peek progress may be committed. + * @param impl is the TokenizedDataImpl instance that holds the actual data. + * @param readCursor is the cursor position from which tokens and text are + * read. + * @param peekCursor is the cursor position from which tokens and text are + * peeked. + */ + TokenizedDataReaderFork(TokenizedDataReader &parent, + std::shared_ptr<TokenizedDataImpl> impl, + size_t readCursor, size_t peekCursor) + : TokenizedDataReader(impl, readCursor, peekCursor), parent(parent) + { + } + +public: + /** + * Commits the read/peek progress to the underlying parent. + */ + void commit() { parent = *this; } +}; } -#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ +#endif /* _OUSIA_TOKENIZED_DATA_HPP_ */ diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp index 2e0ac13..51787cd 100644 --- a/src/core/parser/utils/Tokenizer.cpp +++ b/src/core/parser/utils/Tokenizer.cpp @@ -22,8 +22,8 @@ #include #include #include -#include +#include "TokenizedData.hpp" #include "Tokenizer.hpp" namespace ousia { @@ -42,26 +42,33 @@ struct TokenMatch { Token token; /** - * Current length of the data within the text handler. The text buffer needs - * to be trimmed to this length if this token matches. + * Position at which this token starts in the TokenizedData instance. */ - size_t textLength; + size_t dataStartOffset; /** - * End location of the current text handler. This location needs to be used - * for the text token that is emitted before the actual token. + * Set to true if the matched token is a primary token. */ - size_t textEnd; + bool primary; /** * Constructor of the TokenMatch class. */ - TokenMatch() : textLength(0), textEnd(0) {} + TokenMatch() : dataStartOffset(0), primary(false) {} /** * Returns true if this TokenMatch instance actually represents a match. + * + * @return true if the TokenMatch actually has a match. + */ + bool hasMatch() const { return token.id != Tokens::Empty; } + + /** + * Returns the length of the matched token. + * + * @return the length of the token string. */ - bool hasMatch() { return token.id != Tokens::Empty; } + size_t size() const { return token.content.size(); } }; /* Internal class TokenLookup */ @@ -83,36 +90,28 @@ private: size_t start; /** - * Current length of the data within the text handler. The text buffer needs - * to be trimmed to this length if this token matches. + * Position at which this token starts in the TokenizedData instance. */ - size_t textLength; - - /** - * End location of the current text handler. This location needs to be used - * for the text token that is emitted before the actual token. - */ - size_t textEnd; + size_t dataStartOffset; public: /** * Constructor of the TokenLookup class. * * @param node is the current node. - * @param start is the start position. - * @param textLength is the text buffer length of the previous text token. - * @param textEnd is the current end location of the previous text token. + * @param start is the start position in the source file. + * @param dataStartOffset is the current length of the TokenizedData buffer.
*/ - TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength, - size_t textEnd) - : node(node), start(start), textLength(textLength), textEnd(textEnd) + TokenLookup(const TokenTrie::Node *node, size_t start, + size_t dataStartOffset) + : node(node), start(start), dataStartOffset(dataStartOffset) { } /** * Tries to extend the current path in the token trie with the given - * character. If a complete token is matched, stores this match in the - * tokens list (in case it is longer than any previous token). + * character. If a complete token is matched, stores the match in the given + * TokenMatch reference and returns true. * * @param c is the character that should be appended to the current prefix. * @param lookups is a list to which new TokenLookup instances are added -- * which could be advanced in the next step. * @param match is the Token instance to which the matching token * should be written. * @param tokens is a reference at the internal token list of the * Tokenizer. * @param end is the end byte offset of the current character. * @param sourceId is the source id of this file. + * @return true if a token was matched, false otherwise. */ - void advance(char c, std::vector &lookups, TokenMatch &match, - const std::vector &tokens, SourceOffset end, - SourceId sourceId) + bool advance(char c, std::vector<TokenLookup> &lookups, TokenMatch &match, + const std::vector<Tokenizer::TokenDescriptor> &tokens, + SourceOffset end, SourceId sourceId) { - // Check whether we can continue the current token path with the given - // character without visiting an already visited node + // Set to true once a token has been matched + bool res = false; + + // Check whether we can continue the current token path, if not, abort auto it = node->children.find(c); if (it == node->children.end()) { - return; + return res; } // Check whether the new node represents a complete token and whether it // is longer than the current token. If yes, replace the current token. node = it->second.get(); - if (node->type != Tokens::Empty) { - const std::string &str = tokens[node->type]; - size_t len = str.size(); - if (len > match.token.content.size()) { - match.token = - Token{node->type, str, {sourceId, start, end}}; - match.textLength = textLength; - match.textEnd = textEnd; - } + if (node->id != Tokens::Empty) { + const Tokenizer::TokenDescriptor &descr = tokens[node->id]; + match.token = Token(node->id, descr.string, + SourceLocation(sourceId, start, end)); + match.dataStartOffset = dataStartOffset; + match.primary = descr.primary; + res = true; } // If this state can possibly be advanced, store it in the states list. if (!node->children.empty()) { lookups.emplace_back(*this); } + return res; } }; - -/** - * Transforms the given token into a data token containing the extracted - * text. - * - * @param handler is the WhitespaceHandler containing the collected data. - * @param token is the output token to which the text should be written. - * @param sourceId is the source id of the underlying file. - */ -static void buildDataToken(const WhitespaceHandler &handler, TokenMatch &match, - SourceId sourceId) -{ - if (match.hasMatch()) { - match.token.content = - std::string{handler.textBuf.data(), match.textLength}; - match.token.location = - SourceLocation{sourceId, handler.textStart, match.textEnd}; - } else { - match.token.content = handler.toString(); - match.token.location = - SourceLocation{sourceId, handler.textStart, handler.textEnd}; - } - match.token.id = Tokens::Data; -} } /* Class Tokenizer */ -Tokenizer::Tokenizer(WhitespaceMode whitespaceMode) - : whitespaceMode(whitespaceMode), nextTokenId(0) -{ -} +Tokenizer::Tokenizer() : nextTokenId(0) {} -template <typename TextHandler, bool read> -bool Tokenizer::next(CharReader &reader, Token &token) +template <bool read> +bool Tokenizer::next(CharReader &reader, Token &token, TokenizedData &data) { // If we're in the read mode, reset the char reader peek position to the // current read position @@ -199,43 +173,68 @@ bool Tokenizer::next(CharReader &reader, Token &token) // Prepare the lookups in the token trie const TokenTrie::Node *root = trie.getRoot(); - TokenMatch match; + TokenMatch bestMatch; std::vector<TokenLookup> lookups; std::vector<TokenLookup> nextLookups; - // Instantiate the text handler - TextHandler textHandler; - // Peek characters from the reader and try to advance the current token tree // cursor char c; + const size_t initialDataSize = data.size(); size_t charStart = reader.getPeekOffset(); const SourceId sourceId = reader.getSourceId(); while (reader.peek(c)) { const size_t charEnd = reader.getPeekOffset(); - const size_t textLength = textHandler.textBuf.size(); - const size_t textEnd = textHandler.textEnd; + const size_t dataStartOffset = data.size(); // If we do not have a match yet, start a new lookup from the root - if (!match.hasMatch()) { - TokenLookup{root, charStart, textLength, textEnd}.advance( - c, nextLookups, match, tokens, charEnd, sourceId); + if (!bestMatch.hasMatch()) { + lookups.emplace_back(root, charStart, dataStartOffset); } // Try to advance all other lookups with the new character
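+ // (a primary match competes for "bestMatch" below, while a non-primary
+ // match is immediately recorded as a mark in the TokenizedData instance)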
+ TokenMatch match; for (TokenLookup &lookup : lookups) { - lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId); + // Continue if the current lookup did not match a token + if (!lookup.advance(c, nextLookups, match, tokens, charEnd, + sourceId)) { + continue; + } + + // If the matched token is primary, check whether it is better than + // the current best match, if yes, replace the best match.
In any + // case just continue + if (match.primary) { + if (match.size() > bestMatch.size()) { + bestMatch = match; + } + continue; + } + + // Otherwise -- if the matched token is a non-primary token (and no + // primary token has been found until now) -- mark the match in the + // TokenizedData + if (!bestMatch.hasMatch()) { + data.mark(match.token.id, data.size() - match.size() + 1, + match.size()); + } } // We have found a token and there are no more states to advance or the // text handler has found something -- abort to return the new token - if (match.hasMatch()) { - if ((nextLookups.empty() || textHandler.hasText())) { + if (bestMatch.hasMatch()) { + if ((nextLookups.empty() || data.size() > initialDataSize)) { break; } } else { // Record all incoming characters - textHandler.append(c, charStart, charEnd); + data.append(c, charStart, charEnd); + + // Special token processing + // TODO: Build a special state machine for this in another class + if (c == '\n') { + data.mark(Tokens::Newline, 1); + } } // Swap the lookups and the nextLookups list @@ -246,60 +245,53 @@ bool Tokenizer::next(CharReader &reader, Token &token) charStart = charEnd; } - // If we found text, emit that text - if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) { - buildDataToken(textHandler, match, sourceId); + // If we found data, emit a corresponding data token + if (data.size() > initialDataSize && + (!bestMatch.hasMatch() || + bestMatch.dataStartOffset > initialDataSize)) { + // If we have a "bestMatch" which starts after text data has started, + // trim the TokenizedData to this offset + if (bestMatch.dataStartOffset > initialDataSize) { + data.trim(bestMatch.dataStartOffset); + } + + // Create a token containing the data location + bestMatch.token = Token{data.getLocation()}; } // Move the read/peek cursor to the end of the token, abort if an error // happens while doing so - if (match.hasMatch()) { + if (bestMatch.hasMatch()) { // Make sure we have a valid location - if (match.token.location.getEnd() == InvalidSourceOffset) { + if (bestMatch.token.location.getEnd() == InvalidSourceOffset) { throw OusiaException{"Token end position offset out of range"}; } // Seek to the end of the current token - const size_t end = match.token.location.getEnd(); + const size_t end = bestMatch.token.location.getEnd(); if (read) { reader.seek(end); } else { reader.seekPeekCursor(end); } - token = match.token; + token = bestMatch.token; } else { token = Token{}; } - return match.hasMatch(); + return bestMatch.hasMatch(); } -bool Tokenizer::read(CharReader &reader, Token &token) +bool Tokenizer::read(CharReader &reader, Token &token, TokenizedData &data) { - switch (whitespaceMode) { - case WhitespaceMode::PRESERVE: - return next(reader, token); - case WhitespaceMode::TRIM: - return next(reader, token); - case WhitespaceMode::COLLAPSE: - return next(reader, token); - } - return false; + return next<true>(reader, token, data); } -bool Tokenizer::peek(CharReader &reader, Token &token) +bool Tokenizer::peek(CharReader &reader, Token &token, TokenizedData &data) { - switch (whitespaceMode) { - case WhitespaceMode::PRESERVE: - return next(reader, token); - case WhitespaceMode::TRIM: - return next(reader, token); - case WhitespaceMode::COLLAPSE: - return next(reader, token); - } - return false; + return next<false>(reader, token, data); } -TokenId Tokenizer::registerToken(const std::string &token) +TokenId Tokenizer::registerToken(const std::string &token, bool primary) { // Abort if an empty token should be registered if
(token.empty()) { @@ -309,8 +301,8 @@ TokenId Tokenizer::registerToken(const std::string &token) // Search for a new slot in the tokens list TokenId type = Tokens::Empty; for (size_t i = nextTokenId; i < tokens.size(); i++) { - if (tokens[i].empty()) { - tokens[i] = token; + if (!tokens[i].valid()) { + tokens[i] = TokenDescriptor(token, primary); type = i; break; } @@ -320,62 +312,47 @@ TokenId Tokenizer::registerToken(const std::string &token) // override the special token type handles if (type == Tokens::Empty) { type = tokens.size(); - if (type == Tokens::Data || type == Tokens::Empty) { + if (type >= Tokens::MaxTokenId) { throw OusiaException{"Token type ids depleted!"}; } - tokens.emplace_back(token); + tokens.emplace_back(token, primary); } nextTokenId = type + 1; - // Try to register the token in the trie -- if this fails, remove it - // from the tokens list + // Try to register the token in the trie -- if this fails, remove it from + // the tokens list if (!trie.registerToken(token, type)) { - tokens[type] = std::string{}; + tokens[type] = TokenDescriptor(); nextTokenId = type; return Tokens::Empty; } return type; } -bool Tokenizer::unregisterToken(TokenId type) +bool Tokenizer::unregisterToken(TokenId id) { // Unregister the token from the trie, abort if an invalid type is given - if (type < tokens.size() && trie.unregisterToken(tokens[type])) { - tokens[type] = std::string{}; - nextTokenId = type; + if (id < tokens.size() && trie.unregisterToken(tokens[id].string)) { + tokens[id] = TokenDescriptor(); + nextTokenId = id; return true; } return false; } -std::string Tokenizer::getTokenString(TokenId type) -{ - if (type < tokens.size()) { - return tokens[type]; - } - return std::string{}; -} +static Tokenizer::TokenDescriptor EmptyTokenDescriptor; -void Tokenizer::setWhitespaceMode(WhitespaceMode mode) +const Tokenizer::TokenDescriptor &Tokenizer::lookupToken(TokenId id) const { - whitespaceMode = mode; + if (id < tokens.size()) { + return tokens[id]; + } + return EmptyTokenDescriptor; } -WhitespaceMode Tokenizer::getWhitespaceMode() { return whitespaceMode; } - /* Explicitly instantiate all possible instantiations of the "next" member function */ -template bool Tokenizer::next( - CharReader &reader, Token &token); -template bool Tokenizer::next( - CharReader &reader, Token &token); -template bool Tokenizer::next( - CharReader &reader, Token &token); -template bool Tokenizer::next( - CharReader &reader, Token &token); -template bool Tokenizer::next( - CharReader &reader, Token &token); -template bool Tokenizer::next( - CharReader &reader, Token &token); +template bool Tokenizer::next(CharReader &, Token &, TokenizedData &); +template bool Tokenizer::next(CharReader &, Token &, TokenizedData &); } diff --git a/src/core/parser/utils/Tokenizer.hpp b/src/core/parser/utils/Tokenizer.hpp index f21c6a3..2ddb9c9 100644 --- a/src/core/parser/utils/Tokenizer.hpp +++ b/src/core/parser/utils/Tokenizer.hpp @@ -19,8 +19,8 @@ /** * @file Tokenizer.hpp * - * Tokenizer that can be reconfigured at runtime used for parsing the plain - * text format. + * Tokenizer that can be reconfigured at runtime and is used for parsing the + * plain text format. * * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) */ @@ -33,39 +33,75 @@ #include #include -#include +#include -#include "Token.hpp" #include "TokenTrie.hpp" namespace ousia { // Forward declarations class CharReader; +class TokenizedData; /** * The Tokenizer is used to extract tokens and chunks of text from a - * CharReader. 
It allows to register and unregister tokens while parsing and - * to modify the handling of whitespace characters. Note that the - * Tokenizer always tries to extract the longest possible token from the - * tokenizer. + * CharReader. It allows to register and unregister tokens while parsing. Note + * that the Tokenizer always tries to extract the longest possible token from + * the input. Tokens can be registered as primary or non-primary tokens. If + * a Token is registered as a primary token, it is returned as a single Token + * instance if it occurs. In the non-primary case the token is returned as part + * of a segmented TokenizedData instance. */ class Tokenizer { -private: +public: /** - * Internally used token trie. This object holds all registered tokens. + * Internally used structure describing a registered token. */ - TokenTrie trie; + struct TokenDescriptor { + /** + * String describing the token. + */ + std::string string; + + /** + * Set to true if this token is primary. + */ + bool primary; + + /** + * Constructor of the TokenDescriptor class. + * + * @param string is the string representation of the registered token. + * @param primary specifies whether the token is a primary token that + * should be returned as a single token, or a secondary token that + * should be returned as part of TokenizedData. + */ + TokenDescriptor(const std::string &string, bool primary) + : string(string), primary(primary) + { + } + + /** + * Default constructor. + */ + TokenDescriptor() : primary(false) {} + + /** + * Returns true if the TokenDescriptor represents a valid token. + */ + bool valid() { return !string.empty(); } + }; +private: /** - * Flag defining whether whitespaces should be preserved or not. + * Internally used token trie. This object holds all registered tokens. */ - WhitespaceMode whitespaceMode; + TokenTrie trie; /** * Vector containing all registered token types. */ - std::vector tokens; + std::vector<TokenDescriptor> tokens; /** * Next index in the tokens list where to search for a new token id. @@ -74,90 +110,78 @@ private: /** * Templated function used internally to read the current token. The - * function is templated in order to force code generation for all six - * combiations of whitespace modes and reading/peeking. + * function is templated in order to force optimized code generation for + * both reading and peeking. * - * @tparam TextHandler is the type to be used for the textHandler instance. - * @tparam read specifies whether the function should start from and advance - * the read pointer of the char reader. + * @tparam read specifies whether the method should read the token or just + * peek. * @param reader is the CharReader instance from which the data should be * read. * @param token is the token structure into which the token information * should be written. + * @param data is a reference at the TokenizedData instance to which the + * token information should be appended. * @return false if the end of the stream has been reached, true otherwise. */ - template <typename TextHandler, bool read> - bool next(CharReader &reader, Token &token); + template <bool read> + bool next(CharReader &reader, Token &token, TokenizedData &data); public: /** * Constructor of the Tokenizer class. - * - * @param whitespaceMode specifies how whitespace should be handled. */ - Tokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); + Tokenizer(); /** - * Registers the given string as a token. Returns a const pointer at a - * TokenDescriptor that will be used to reference the newly created token. + * Registers the given string as a token. Returns a unique identifier + * describing the registered token.
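+ *
+ * A usage sketch (the token string and the reader and data variables are
+ * illustrative):
+ * @code
+ * Tokenizer tokenizer;
+ * TokenId id = tokenizer.registerToken("<<", true);
+ * Token token;
+ * TokenizedData data(sourceId);
+ * while (tokenizer.read(reader, token, data)) {
+ *     // token.id is either "id" or Tokens::Data for a text run,
+ *     // whose characters are appended to "data"
+ * }
+ * @endcode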
* * @param token is the token string that should be registered. - * @return a unique identifier for the registered token or EmptyToken if + * @param primary specifies whether the token is a primary token -- if true, + * the token will be returned as a single, standalone token. Otherwise the + * token will be returned as part of a "TokenizedData" structure. + * @return a unique identifier for the registered token or Tokens::Empty if * an error occurred. */ - TokenId registerToken(const std::string &token); + TokenId registerToken(const std::string &token, bool primary = true); /** * Unregisters the token belonging to the given TokenId. * * @param id is the token type that should be unregistered. The - *TokenId - * must have been returned by registerToken. + * TokenId must have been returned by registerToken. * @return true if the operation was successful, false otherwise (e.g. - * because the given TokenDescriptor was already unregistered). + * because the token with the given TokenId was already unregistered). */ - bool unregisterToken(TokenId type); + bool unregisterToken(TokenId id); /** * Returns the token that was registered under the given TokenId id or - *an - * empty string if an invalid TokenId id is given. + * an invalid TokenDescriptor if an invalid TokenId id is given. * - * @param type is the TokenId id for which the corresponding token - *string + * @param id is the TokenId for which the corresponding TokenDescriptor * should be returned. - * @return the registered token string or an empty string if the given type - * was invalid. - */ - std::string getTokenString(TokenId type); - - /** - * Sets the whitespace mode. - * - * @param whitespaceMode defines how whitespace should be treated in text - * tokens. - */ - void setWhitespaceMode(WhitespaceMode mode); - - /** - * Returns the current value of the whitespace mode. - * - * @return the whitespace mode. + * @return the registered TokenDescriptor or an invalid TokenDescriptor if + * the given TokenId is invalid. */ - WhitespaceMode getWhitespaceMode(); + const TokenDescriptor &lookupToken(TokenId id) const; /** * Reads a new token from the CharReader and stores it in the given - * Token instance. + * Token instance. If the token has the id Tokens::Data, the corresponding + * character data has been appended to the given TokenizedData instance. * * @param reader is the CharReader instance from which the data should be * read. * @param token is a reference at the token instance into which the Token * information should be written. + * @param data is a reference at the TokenizedData instance to which the + * token information should be appended. * @return true if a token could be read, false if the end of the stream * has been reached. */ - bool read(CharReader &reader, Token &token); + bool read(CharReader &reader, Token &token, TokenizedData &data); /** * The peek method does not advance the read position of the char reader, @@ -167,10 +191,12 @@ public: * read. * @param token is a reference at the token instance into which the Token * information should be written. + * @param data is a reference at the TokenizedData instance to which the + * token information should be appended. * @return true if a token could be read, false if the end of the stream * has been reached. */ - bool peek(CharReader &reader, Token &token); + bool peek(CharReader &reader, Token &token, TokenizedData &data); }; } -- cgit v1.2.3