diff options
author | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-22 23:06:54 +0100 |
---|---|---|
committer | Andreas Stöckel <astoecke@techfak.uni-bielefeld.de> | 2015-02-22 23:06:54 +0100 |
commit | cb697e7eb78ad0bdfc2a20a7bdd2c369b678ca09 (patch) | |
tree | b75bb61ce1467fca50133c2dd5c6bcdc551dc0e8 /src/core | |
parent | 6ace07685cbaa81338ec5e68487054dcbf9da969 (diff) |
Implemented TokenizedData, a facility to store data with tokens where tokens can be dynamically enabled and the whitespace mode specified at the moment the tokens are read
Diffstat (limited to 'src/core')
-rw-r--r-- | src/core/parser/utils/TokenizedData.cpp | 365 | ||||
-rw-r--r-- | src/core/parser/utils/TokenizedData.hpp | 189 |
2 files changed, 554 insertions, 0 deletions
diff --git a/src/core/parser/utils/TokenizedData.cpp b/src/core/parser/utils/TokenizedData.cpp new file mode 100644 index 0000000..fc7bfaf --- /dev/null +++ b/src/core/parser/utils/TokenizedData.cpp @@ -0,0 +1,365 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <algorithm> +#include <limits> +#include <vector> + +#include <core/common/Utils.hpp> + +#include "SourceOffsetVector.hpp" +#include "TokenizedData.hpp" + +namespace ousia { +namespace { +/** + * Structure used to represent the position of a token in the internal + * character buffer. + */ +struct TokenMark { + /** + * Relative position of the token in the buffer. + */ + size_t bufStart; + + /** + * TokenId of the associated token. + */ + TokenId id; + + /** + * Length of the token. + */ + TokenLength len; + + /** + * Constructor of the TokenMark structure, initializes all members with the + * given values. + * + * @param id is the id of the token that is marked here. + * @param bufStart is the start position of the TokenMark in the internal + * character buffer. + * @param len is the length of the token. + */ + TokenMark(TokenId id, size_t bufStart, TokenLength len) + : bufStart(bufStart), id(id), len(len) + { + } + + /** + * Creates a dummy TokenMark instance used for comparison purposes. This + * TokenMark will compare to be smaller than an equal TokenMark with + * equivalent start. + * + * @param bufStart is start position of the TokenMark in the internal + * character buffer. + */ + TokenMark(size_t bufStart) + : bufStart(bufStart), + id(Tokens::Empty), + len(std::numeric_limits<TokenLength>::max()) + { + } + + /** + * Operator used for sorting TokenMark instances. They are sorted in such a + * way that the instances with smallest bufStart come first and for equal + * bufStart values instances with the largest length come first. + * + * @param m1 is the left-hand side TokenMark instance of the comparison. + * @param m2 is the right-hand side TokenMark instance of the comparison. + */ + friend bool operator<(const TokenMark &m1, const TokenMark &m2) + { + return (m1.bufStart < m2.bufStart) || + (m1.bufStart == m2.bufStart && m1.len > m2.len); + } +}; +} + +/** + * Structure used to hold all the internal data structures that may be + * exchanged between TokenizedData instances. + */ +class TokenizedDataImpl { +private: + /** + * SourceId representing the source file from which the current content is + * being read. + */ + SourceId sourceId; + + /** + * Buffer containing the actual character data. + */ + std::vector<char> buf; + + /** + * Vector containing all token marks. + */ + std::vector<TokenMark> marks; + + /** + * Vector storing all the character offsets efficiently. + */ + SourceOffsetVector offsets; + + /** + * Flag indicating whether the internal "marks" vector is sorted. + */ + bool sorted; + +public: + /** + * Constructor of TokenizedDataImpl. Takes the SourceId that should be used + * for returned tokens. + * + * @param sourceId is the source identifier that should be used for + * constructing the location when returning tokens. + */ + TokenizedDataImpl(SourceId sourceId) : sourceId(sourceId), sorted(true) {} + + /** + * Appends a complete string to the internal character buffer and extends + * the text regions in the regions map. + * + * @param data is the string that should be appended to the buffer. + * @param offsStart is the start offset in bytes in the input file. + * @return the current size of the internal byte buffer. The returned value + * is intended to be used for the "mark" function. + */ + size_t append(const std::string &data, SourceOffset offsStart) + { // Append the data to the internal buffer + buf.insert(buf.end(), data.begin(), data.end()); + + // Extend the text regions, interpolate the source position (this may + // yield incorrect results) + const size_t size = buf.size(); + for (SourceOffset offs = offsStart; offs < offsStart + data.size(); + offs++) { + offsets.storeOffset(offs, offs + 1); + } + + return size; + } + + /** + * Appends a single character to the internal character buffer and extends + * the text regions in the regions map. + * + * @param c is the character that should be appended to the buffer. + * @param offsStart is the start offset in bytes in the input file. + * @param offsEnd is the end offset in bytes in the input file. + * @return the current size of the internal byte buffer. The returned value + * is intended to be used for the "mark" function. + */ + size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd) + { + // Add the character to the list and store the location of the character + // in the source file + buf.push_back(c); + offsets.storeOffset(offsStart, offsEnd); + return buf.size(); + } + + /** + * Stores a token at the given position. + * + * @param id is the token that should be stored. + * @param bufStart is the start position in the internal buffer. Use the + * values returned by append to calculate the start position. + * @param len is the length of the token. + */ + void mark(TokenId id, size_t bufStart, TokenLength len) + { + // Push the new instance back onto the list + marks.emplace_back(id, bufStart, len); + + // Update the sorted flag as soon as more than one element is in the + // list + if (marks.size() > 1U) { + sorted = sorted && *(marks.end() - 2) < *(marks.end() - 1); + } + } + + /** + * Returns the next token or a text token if no explicit token is available. + * Advances the given cursor to the end of the returned token. + * + * @param token is the Token instance to which the token should be written. + * @param mode is the WhitespaceMode to be used for extracting the text + * cursor. + * @param tokens is a set of enabled tokens. Tokens that are not in this set + * are ignored and returned as text. + * @param cursor is the position in the character buffer from which on the + * next token should be read. The cursor will be updated to the position + * beyond the returned token. + * @return true if a token was returned, false if no more tokens are + * available. + */ + bool next(Token &token, WhitespaceMode mode, + const std::unordered_set<TokenId> &tokens, size_t &cursor) + { + // Sort the "marks" vector if it has not been sorted yet. + if (!sorted) { + std::sort(marks.begin(), marks.end()); + sorted = true; + } + + // Fetch the next larger TokenMark instance, make sure the token is in + // the "enabled" list + auto it = + std::lower_bound(marks.begin(), marks.end(), TokenMark(cursor)); + while (it != marks.end() && tokens.count(it->id) == 0) { + it++; + } + + // Calculate the buffer start and end character, based on the returned + // TokenMark instance + const size_t end = (it != marks.end()) ? it->bufStart : buf.size(); + + // Depending on the whitespace mode, fetch all the data between the + // cursor position and the calculated end position and return a token + // containing that data. + if (cursor < end && cursor < buf.size()) { + switch (mode) { + case WhitespaceMode::PRESERVE: { + token = Token( + Tokens::Data, std::string(&buf[cursor], end - cursor), + SourceLocation(sourceId, + offsets.loadOffset(cursor).first, + offsets.loadOffset(end).first)); + cursor = end; + return true; + } + case WhitespaceMode::TRIM: + case WhitespaceMode::COLLAPSE: { + // Calculate the collapsed string and the corresponding + // trimmed region + size_t stringStart; + size_t stringEnd; + std::string content; + if (mode == WhitespaceMode::TRIM) { + content = Utils::trim(&buf[cursor], end - cursor, + stringStart, stringEnd); + } else { + content = Utils::collapse(&buf[cursor], end - cursor, + stringStart, stringEnd); + } + + // If the resulting string is empty (only whitespaces), + // abort + if (content.empty()) { + cursor = end; + break; + } + + // Calculate the absolute positions and return the token + stringStart += cursor; + stringEnd += cursor; + token = Token( + Tokens::Data, content, + SourceLocation(sourceId, + offsets.loadOffset(stringStart).first, + offsets.loadOffset(stringEnd).first)); + cursor = end; + return true; + } + } + } + + // If start equals end, we're currently directly at a token + // instance. Return this token and advance the cursor to the end of + // the token. + if (cursor == end && it != marks.end()) { + const size_t tokenStart = it->bufStart; + const size_t tokenEnd = it->bufStart + it->len; + token = Token( + it->id, std::string(&buf[tokenStart], it->len), + SourceLocation(sourceId, offsets.loadOffset(tokenStart).first, + offsets.loadOffset(tokenEnd).first)); + cursor = tokenEnd; + return true; + } + + // We've failed. There is no token. Only void. Reset token and return + // false. + token = Token(); + return false; + } + + /** + * Returns the current size of the internal buffer. + * + * @return the size of the internal character buffer. + */ + size_t getSize() { return buf.size(); } +}; + +/* Class TokenizedData */ + +TokenizedData::TokenizedData() : TokenizedData(InvalidSourceId) {} + +TokenizedData::TokenizedData(SourceId sourceId) + : impl(std::make_shared<TokenizedDataImpl>(sourceId)), cursor(0) +{ +} + +TokenizedData::~TokenizedData() {} + +size_t TokenizedData::append(const std::string &data, SourceOffset offsStart) +{ + return impl->append(data, offsStart); +} + +size_t TokenizedData::append(char c, SourceOffset offsStart, + SourceOffset offsEnd) +{ + return impl->append(c, offsStart, offsEnd); +} + +void TokenizedData::mark(TokenId id, TokenLength len) +{ + impl->mark(id, impl->getSize() - len, len); +} + +void TokenizedData::mark(TokenId id, size_t bufStart, TokenLength len) +{ + impl->mark(id, bufStart, len); +} + +bool TokenizedData::next(Token &token, WhitespaceMode mode) +{ + return impl->next(token, mode, tokens, cursor); +} + +bool TokenizedData::text(Token &token, WhitespaceMode mode) +{ + // Copy the current cursor position to not update the actual cursor position + // if the operation was not successful + size_t cursorCopy = cursor; + if (!impl->next(token, mode, tokens, cursorCopy) || + token.id != Tokens::Data) { + return false; + } + + // There is indeed a text token, update the internal cursor position + cursor = cursorCopy; + return true; +} +} diff --git a/src/core/parser/utils/TokenizedData.hpp b/src/core/parser/utils/TokenizedData.hpp new file mode 100644 index 0000000..38125c4 --- /dev/null +++ b/src/core/parser/utils/TokenizedData.hpp @@ -0,0 +1,189 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + * @file TokenizedData.hpp + * + * The TokenizedData class defined in this file stores string data extracted + * from a document including user defined tokens. Tokens can be dynamically + * enabled and disabled. And the data up to the next enabled token can be + * returned. Additionally, the data provided by the TokenizedData class is + * processed according to a whitespace mode that can be dynamically updated. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_TOKENIZED_DATA_HPP_ +#define _OUSIA_TOKENIZED_DATA_HPP_ + +#include <cstdint> +#include <memory> +#include <unordered_set> + +#include <core/common/Location.hpp> +#include <core/common/Whitespace.hpp> + +#include "Token.hpp" + +namespace ousia { + +// Forward declaration +class TokenizedDataImpl; + +/** + * The TokenizedData class stores data extracted from a user defined document. + * As users are capable of defining their own tokens and these are only valid + * in certain scopes TokenizedData allows to divide the stored data into chunks + * separated by tokens. + */ +class TokenizedData { +private: + /** + * Shared pointer pointing at the internal data. This data is shared when + * copying TokenizedData instances, which corresponds to forking a + * TokenizedData instance. + */ + std::shared_ptr<TokenizedDataImpl> impl; + + /** + * Contains all currently enabled token ids. + */ + std::unordered_set<TokenId> tokens; + + /** + * Position from which the last element was read from the internal buffer. + * This information is not shared with the other instances of TokenizedData + * pointing at the same location. + */ + size_t cursor; + +public: + /** + * Default constructor, creates a new instance of TokenizedData, sets the + * internal SourceId to the InvalidSourceId constant. + */ + TokenizedData(); + + /** + * Creates a new instance of TokenizedData, takes a SourceId. + * + * @param sourceId is the source identifier that should be used for + * constructing the location when returning tokens. + */ + TokenizedData(SourceId sourceId); + + /** + * Destructor. Needs to be defined explicitly for freeing a shared pointer + * of the incomplete TokenizedDataImpl type. + */ + ~TokenizedData(); + + /** + * Appends a complete string to the internal character buffer. Note that the + * start and end positions for each character in the given data string will + * be interpolated and may thus be incorrect (e.g. when multi-character + * linebreaks or multi-character characters (not handled now) are read). + * + * @param data is the string that should be appended to the buffer. + * @param offsStart is the start offset in bytes in the input file. + * @return the current size of the internal byte buffer. The returned value + * is intended to be used for the "mark" function. + */ + size_t append(const std::string &data, SourceOffset offsStart = 0); + + /** + * Appends a single character to the internal character buffer. + * + * @param c is the character that should be appended to the buffer. + * @param start is the start offset in bytes in the input file. + * @param end is the end offset in bytes in the input file. + * @return the current size of the internal byte buffer. The returned value + * is intended to be used for the "mark" function. + */ + size_t append(char c, SourceOffset offsStart, SourceOffset offsEnd); + + /** + * Stores a token ending at the last character of the current buffer. + * + * @param id is the id of the token for which the mark should be stored. + * @param len is the length of the token. + */ + void mark(TokenId id, TokenLength len); + + /** + * Stores a token at the given position. + * + * @param id is the if of the token for which the mark should be stored. + * @param bufStart is the start position in the internal buffer. Use the + * values returned by append to calculate the start position. + * @param len is the length of the token. + */ + void mark(TokenId id, size_t bufStart, TokenLength len); + + /** + * Enables a single token id. Enabled tokens will no longer be returned as + * text. Instead, when querying for the next token, TokenizedData will + * return them as token and not as part of a Text token. + * + * @param id is the TokenId of the token that should be enabled. + */ + void enableToken(TokenId id) { tokens.insert(id); } + + /** + * Enables a set of token ids. Enabled tokens will no longer be returned as + * text. Instead, when querying for the next token, TokenizedData will + * return them as token and not as part of a Text token. + * + * @param ids is the TokenId of the token that should be enabled. + */ + void enableToken(const std::unordered_set<TokenId> &ids) + { + tokens.insert(ids.begin(), ids.end()); + } + + /** + * Stores the next token in the given token reference, returns true if the + * operation was successful, false if there are no more tokens. + * + * @param token is an output parameter into which the read token will be + * stored. The TokenId is set to Tokens::Empty if there are no more tokens. + * @param mode is the whitespace mode that should be used when a text token + * is returned. + * @return true if the operation was successful and there is a next token, + * false if there are no more tokens. + */ + bool next(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE); + + /** + * Stores the next text token in the given token reference, returns true if + * the operation was successful (there was indeed a text token), false if + * the next token is not a text token or there were no more tokens. + * + * @param token is an output parameter into which the read token will be + * stored. The TokenId is set to Tokens::Empty if there are no more tokens. + * @param mode is the whitespace mode that should be used when a text token + * is returned. + * @return true if the operation was successful and there is a next token, + * false if there are no more tokens. + */ + bool text(Token &token, WhitespaceMode mode = WhitespaceMode::COLLAPSE); +}; +} + +#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ + |