diff options
Diffstat (limited to 'src/formats')
-rw-r--r-- | src/formats/osml/OsmlStreamParser.cpp | 157 | ||||
-rw-r--r-- | src/formats/osml/OsmlStreamParser.hpp | 85 | ||||
-rw-r--r-- | src/formats/osxml/OsxmlEventParser.cpp | 63 | ||||
-rw-r--r-- | src/formats/osxml/OsxmlEventParser.hpp | 31 |
4 files changed, 90 insertions, 246 deletions
diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp index f61ac7d..d4cdbf8 100644 --- a/src/formats/osml/OsmlStreamParser.cpp +++ b/src/formats/osml/OsmlStreamParser.cpp @@ -94,92 +94,11 @@ public: static const PlainFormatTokens OsmlTokens; -/** - * Class used internally to collect data issued via "DATA" event. - */ -class DataHandler { -private: - /** - * Internal character buffer. - */ - std::vector<char> buf; - - /** - * Start location of the character data. - */ - SourceOffset start; - - /** - * End location of the character data. - */ - SourceOffset end; - -public: - /** - * Default constructor, initializes start and end with zeros. - */ - DataHandler() : start(0), end(0) {} - - /** - * Returns true if the internal buffer is empty. - * - * @return true if no characters were added to the internal buffer, false - * otherwise. - */ - bool isEmpty() { return buf.empty(); } - - /** - * Appends a single character to the internal buffer. - * - * @param c is the character that should be added to the internal buffer. - * @param charStart is the start position of the character. - * @param charEnd is the end position of the character. - */ - void append(char c, SourceOffset charStart, SourceOffset charEnd) - { - if (isEmpty()) { - start = charStart; - } - buf.push_back(c); - end = charEnd; - } - - /** - * Appends a string to the internal buffer. - * - * @param s is the string that should be added to the internal buffer. - * @param stringStart is the start position of the string. - * @param stringEnd is the end position of the string. - */ - void append(const std::string &s, SourceOffset stringStart, - SourceOffset stringEnd) - { - if (isEmpty()) { - start = stringStart; - } - std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf)); - end = stringEnd; - } - - /** - * Converts the internal buffer to a variant with attached location - * information. - * - * @param sourceId is the source id which is needed for building the - * location information. - * @return a Variant with the internal buffer content as string and - * the correct start and end location. - */ - Variant toVariant(SourceId sourceId) - { - Variant res = Variant::fromString(std::string(buf.data(), buf.size())); - res.setLocation({sourceId, start, end}); - return res; - } -}; - OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger) - : reader(reader), logger(logger), tokenizer(OsmlTokens) + : reader(reader), + logger(logger), + tokenizer(OsmlTokens), + data(reader.getSourceId()) { // Place an intial command representing the complete file on the stack commands.push(Command{"", Variant::mapType{}, true, true, true, false}); @@ -188,7 +107,7 @@ OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger) Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep) { bool first = true; - bool hasCharSiceNSSep = false; + bool hasCharSinceNSSep = false; std::vector<char> identifier; size_t end = reader.getPeekOffset(); char c, c2; @@ -197,7 +116,7 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep) if ((first && Utils::isIdentifierStartCharacter(c)) || (!first && Utils::isIdentifierCharacter(c))) { identifier.push_back(c); - } else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) && + } else if (c == ':' && hasCharSinceNSSep && reader.fetchPeek(c2) && Utils::isIdentifierStartCharacter(c2)) { identifier.push_back(c); } else { @@ -214,8 +133,8 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep) // This is no longer the first character first = false; - // Advance the hasCharSiceNSSep flag - hasCharSiceNSSep = allowNSSep && (c != ':'); + // Advance the hasCharSinceNSSep flag + hasCharSinceNSSep = allowNSSep && (c != ':'); end = reader.getPeekOffset(); reader.consumePeek(); @@ -488,7 +407,10 @@ void OsmlStreamParser::parseBlockComment() { Token token; size_t depth = 1; - while (tokenizer.read(reader, token)) { + while (tokenizer.read(reader, token, data)) { + // Throw the comment data away + data.clear(); + if (token.id == OsmlTokens.BlockCommentEnd) { depth--; if (depth == 0) { @@ -514,10 +436,9 @@ void OsmlStreamParser::parseLineComment() } } -bool OsmlStreamParser::checkIssueData(DataHandler &handler) +bool OsmlStreamParser::checkIssueData() { - if (!handler.isEmpty()) { - data = handler.toVariant(reader.getSourceId()); + if (!data.empty()) { location = data.getLocation(); reader.resetPeek(); return true; @@ -575,12 +496,12 @@ bool OsmlStreamParser::closeField() OsmlStreamParser::State OsmlStreamParser::parse() { - // Handler for incomming data - DataHandler handler; + // Reset the data handler + data.clear(); // Read tokens until the outer loop should be left Token token; - while (tokenizer.peek(reader, token)) { + while (tokenizer.peek(reader, token, data)) { const TokenId type = token.id; // Special handling for Backslash and Text @@ -606,7 +527,7 @@ OsmlStreamParser::State OsmlStreamParser::parse() // Try to parse a command if (Utils::isIdentifierStartCharacter(c)) { // Make sure to issue any data before it is to late - if (checkIssueData(handler)) { + if (checkIssueData()) { return State::DATA; } @@ -633,12 +554,11 @@ OsmlStreamParser::State OsmlStreamParser::parse() // If this was an annotation start token, add the parsed < to the // output if (type == OsmlTokens.AnnotationStart) { - handler.append('<', token.location.getStart(), - token.location.getStart() + 1); + data.append('<', token.location.getStart(), + token.location.getStart() + 1); } - handler.append(c, token.location.getStart(), - reader.getPeekOffset()); + data.append(c, token.location.getStart(), reader.getPeekOffset()); reader.consumePeek(); continue; } else if (type == Tokens::Data) { @@ -647,18 +567,13 @@ OsmlStreamParser::State OsmlStreamParser::parse() location = token.location; return State::FIELD_START; } - - // Append the text to the data handler - handler.append(token.content, token.location.getStart(), - token.location.getEnd()); - reader.consumePeek(); continue; } // A non-text token was reached, make sure all pending data commands // have been issued - if (checkIssueData(handler)) { + if (checkIssueData()) { return State::DATA; } @@ -676,34 +591,36 @@ OsmlStreamParser::State OsmlStreamParser::parse() Command &cmd = commands.top(); if (!cmd.inField) { cmd.inField = true; - return State::FIELD_START; } - logger.error( + return State::FIELD_START; +/* logger.error( "Got field start token \"{\", but no command for which to " "start the field. Write \"\\{\" to insert this sequence as " "text.", - token); + token);*/ } else if (token.id == OsmlTokens.FieldEnd) { - if (closeField()) { + closeField(); + return State::FIELD_END; +/* if (closeField()) { return State::FIELD_END; } logger.error( "Got field end token \"}\", but there is no field to end. " "Write \"\\}\" to insert this sequence as text.", - token); + token);*/ } else if (token.id == OsmlTokens.DefaultFieldStart) { // Try to start a default field the first time the token is reached Command &topCmd = commands.top(); if (!topCmd.inField) { topCmd.inField = true; topCmd.inDefaultField = true; - return State::FIELD_START; } - logger.error( + return State::FIELD_START; +/* logger.error( "Got default field start token \"{!\", but no command for " "which to start the field. Write \"\\{!\" to insert this " "sequence as text", - token); + token);*/ } else if (token.id == OsmlTokens.AnnotationEnd) { // We got a single annotation end token "\>" -- simply issue the // ANNOTATION_END event @@ -717,7 +634,7 @@ OsmlStreamParser::State OsmlStreamParser::parse() } // Issue available data - if (checkIssueData(handler)) { + if (checkIssueData()) { return State::DATA; } @@ -737,6 +654,14 @@ OsmlStreamParser::State OsmlStreamParser::parse() return State::END; } +Variant OsmlStreamParser::getText(WhitespaceMode mode) +{ + TokenizedData dataFork = data; + Variant text = dataFork.text(mode); + location = text.getLocation(); + return text; +} + const Variant &OsmlStreamParser::getCommandName() const { return commands.top().name; diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp index dc3034c..453a2bb 100644 --- a/src/formats/osml/OsmlStreamParser.hpp +++ b/src/formats/osml/OsmlStreamParser.hpp @@ -29,17 +29,19 @@ #ifndef _OUSIA_OSML_STREAM_PARSER_HPP_ #define _OUSIA_OSML_STREAM_PARSER_HPP_ -#include <stack> +#include <memory> #include <core/common/Variant.hpp> +#include <core/common/Whitespace.hpp> #include <core/parser/utils/Tokenizer.hpp> +#include <core/parser/utils/TokenizedData.hpp> namespace ousia { // Forward declarations class CharReader; class Logger; -class DataHandler; +class OsmlStreamParserImpl; /** * The OsmlStreamParser class provides a low-level reader for the TeX-esque osml @@ -137,26 +139,15 @@ public: Variant arguments; /** - * Set to true if this is a command with clear begin and end. - */ - bool hasRange : 1; - - /** - * Set to true if we are currently inside a field of this command. - */ - bool inField : 1; - - /** - * Set to true if we are currently in the range field of the command - * (implies inField being set to true). + * Vector used as stack for holding the number of opening/closing braces + * and the corresponding "isDefaultField" flag. */ - bool inRangeField : 1; + std::vector<bool> fields; /** - * Set to true if we are currently in a field that has been especially - * marked as default field (using the "|") syntax. + * Set to true if this is a command with clear begin and end. */ - bool inDefaultField : 1; + bool hasRange; /** * Default constructor. @@ -164,7 +155,6 @@ public: Command() : hasRange(false), inField(false), - inRangeField(false), inDefaultField() { } @@ -178,15 +168,10 @@ public: * command. * @param hasRange should be set to true if this is a command with * explicit range. - * @param inField is set to true if we currently are inside a field - * of this command. - * @param inRangeField is set to true if we currently are inside the - * outer field of a ranged command. * @param inDefaultField is set to true if we currently are in a * specially marked default field. */ - Command(Variant name, Variant arguments, bool hasRange, - bool inField, bool inRangeField, bool inDefaultField) + Command(Variant name, Variant arguments, bool hasRange) : name(std::move(name)), arguments(std::move(arguments)), hasRange(hasRange), @@ -215,25 +200,20 @@ private: Tokenizer tokenizer; /** - * Stack containing the current commands. - */ - std::stack<Command> commands; - - /** - * Variant containing the data that has been read (always is a string, - * contains the exact location of the data in the source file). + * Variant containing the tokenized data that was returned from the + * tokenizer as data. */ - Variant data; + TokenizedData data; /** - * Contains the location of the last token. + * Stack containing the current commands. */ - SourceLocation location; + std::stack<Command> commands; /** - * Contains the field index of the current command. + * Pointer at */ - size_t fieldIdx; + std::unique_ptr<OsmlStreamParserImpl> impl; /** * Function used internall to parse an identifier. @@ -291,12 +271,10 @@ private: /** * Checks whether there is any data pending to be issued, if yes, issues it. * - * @param handler is the data handler that contains the data that may be - * returned to the user. * @return true if there was any data and DATA should be returned by the * parse function, false otherwise. */ - bool checkIssueData(DataHandler &handler); + bool checkIssueData(); /** * Called before any data is appended to the internal data handler. Checks @@ -328,6 +306,12 @@ public: OsmlStreamParser(CharReader &reader, Logger &logger); /** + * Destructor of the OsmlStreamParser, needed to destroy the incomplete + * OsmlStreamParserImpl. + */ + ~OsmlStreamParser(); + + /** * Continues parsing. Returns one of the states defined in the State enum. * Callers should stop once the State::END state is reached. Use the getter * functions to get more information about the current state, such as the @@ -344,7 +328,19 @@ public: * @return a reference at a variant containing the data parsed by the * "parse" function. */ - const Variant &getData() const { return data; } + const TokenizedData &getData() const { return data; } + + /** + * Returns the complete content of the internal TokenizedData instance as + * a single string Variant. This method is mainly used in the unit tests for + * this class, it simply calls the text() method of TokenizedData. + * + * @param mode is the WhitespaceMode that should be used for returning the + * text. + * @return a string variant containing the text content of the internal + * TokenizedData instance or a nullptr variant if there is no text. + */ + Variant getText(WhitespaceMode mode = WhitespaceMode::COLLAPSE); /** * Returns a reference at the internally stored command name. Only valid if @@ -371,13 +367,6 @@ public: * syntax). */ bool inDefaultField() const; - - /** - * Returns a reference at the char reader. - * - * @return the last internal token location. - */ - const SourceLocation &getLocation() const { return location; } }; } diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp index c9254b0..855f80d 100644 --- a/src/formats/osxml/OsxmlEventParser.cpp +++ b/src/formats/osxml/OsxmlEventParser.cpp @@ -25,7 +25,6 @@ #include <core/common/Variant.hpp> #include <core/common/VariantReader.hpp> #include <core/common/Utils.hpp> -#include <core/common/WhitespaceHandler.hpp> #include "OsxmlAttributeLocator.hpp" #include "OsxmlEventParser.hpp" @@ -57,17 +56,6 @@ public: std::vector<char> textBuf; /** - * Current whitespace buffer (for the trimming whitspace mode) - */ - std::vector<char> whitespaceBuf; - - /** - * Flag indicating whether a whitespace character was present (for the - * collapsing whitespace mode). - */ - bool hasWhitespace; - - /** * Current character data start. */ size_t textStart; @@ -394,33 +382,17 @@ static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len) SourceLocation loc = xmlSyncLoggerPosition(p, ulen); // Fetch some variables for convenience - const WhitespaceMode mode = parser->getWhitespaceMode(); OsxmlEventParserData &data = parser->getData(); std::vector<char> &textBuf = data.textBuf; - std::vector<char> &whitespaceBuf = data.whitespaceBuf; - bool &hasWhitespace = data.hasWhitespace; - size_t &textStart = data.textStart; - size_t &textEnd = data.textEnd; - - size_t pos = loc.getStart(); - for (size_t i = 0; i < ulen; i++, pos++) { - switch (mode) { - case WhitespaceMode::PRESERVE: - PreservingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, - textStart, textEnd); - break; - case WhitespaceMode::TRIM: - TrimmingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, - textStart, textEnd, - whitespaceBuf); - break; - case WhitespaceMode::COLLAPSE: - CollapsingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, - textStart, textEnd, - hasWhitespace); - break; - } + + // Update start and end position + if (textBuf.empty()) { + data.textStart = loc.getStart(); } + data.textEnd = loc.getEnd(); + + // Insert the data into the text buffer + textBuf.insert(textBuf.end(), &s[0], &s[ulen]); } /* Class OsxmlEvents */ @@ -430,11 +402,7 @@ OsxmlEvents::~OsxmlEvents() {} /* Class OsxmlEventParser */ OsxmlEventParserData::OsxmlEventParserData() - : depth(0), - annotationEndTagDepth(-1), - hasWhitespace(false), - textStart(0), - textEnd(0) + : depth(0), annotationEndTagDepth(-1), textStart(0), textEnd(0) { } @@ -466,8 +434,6 @@ Variant OsxmlEventParserData::getText(SourceId sourceId) // Reset the text buffers textBuf.clear(); - whitespaceBuf.clear(); - hasWhitespace = false; textStart = 0; textEnd = 0; @@ -482,7 +448,6 @@ OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events, : reader(reader), events(events), logger(logger), - whitespaceMode(WhitespaceMode::COLLAPSE), data(new OsxmlEventParserData()) { } @@ -532,16 +497,6 @@ void OsxmlEventParser::parse() } } -void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode) -{ - this->whitespaceMode = whitespaceMode; -} - -WhitespaceMode OsxmlEventParser::getWhitespaceMode() const -{ - return whitespaceMode; -} - CharReader &OsxmlEventParser::getReader() const { return reader; } Logger &OsxmlEventParser::getLogger() const { return logger; } diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp index e39245f..e3fd5d4 100644 --- a/src/formats/osxml/OsxmlEventParser.hpp +++ b/src/formats/osxml/OsxmlEventParser.hpp @@ -32,8 +32,6 @@ #include <memory> #include <string> -#include <core/common/Whitespace.hpp> - namespace ousia { // Forward declarations @@ -99,13 +97,10 @@ public: virtual void fieldEnd() = 0; /** - * Called whenever data is found. Whitespace data is handled as specified - * and the data has been parsed to the specified variant type. This function - * is not called if the parsing failed, the parser prints an error message - * instead. + * Called whenever string data is found. * - * @param data is the already parsed data that should be passed to the - * handler. + * @param data is a Variant containing the string data that was found in the + * XML file. */ virtual void data(const Variant &data) = 0; }; @@ -135,11 +130,6 @@ private: Logger &logger; /** - * Current whitespace mode. - */ - WhitespaceMode whitespaceMode; - - /** * Data to be used by the internal functions. */ std::unique_ptr<OsxmlEventParserData> data; @@ -171,21 +161,6 @@ public: void parse(); /** - * Sets the whitespace handling mode. - * - * @param whitespaceMode defines how whitespace in the data should be - * handled. - */ - void setWhitespaceMode(WhitespaceMode whitespaceMode); - - /** - * Returns the current whitespace handling mode. - * - * @return the currently set whitespace handling mode. - */ - WhitespaceMode getWhitespaceMode() const; - - /** * Returns the internal CharReader reference. * * @return the CharReader reference. |