summary refs log tree commit diff
path: root/src/formats
diff options
context:
space:
mode:
Diffstat (limited to 'src/formats')
-rw-r--r-- src/formats/osml/OsmlStreamParser.cpp 157
-rw-r--r-- src/formats/osml/OsmlStreamParser.hpp 85
-rw-r--r-- src/formats/osxml/OsxmlEventParser.cpp 63
-rw-r--r-- src/formats/osxml/OsxmlEventParser.hpp 31
4 files changed, 90 insertions, 246 deletions
diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp
index f61ac7d..d4cdbf8 100644
--- a/src/formats/osml/OsmlStreamParser.cpp
+++ b/src/formats/osml/OsmlStreamParser.cpp
@@ -94,92 +94,11 @@ public:
static const PlainFormatTokens OsmlTokens;
-/**
- * Class used internally to collect data issued via "DATA" event.
- */
-class DataHandler {
-private:
- /**
- * Internal character buffer.
- */
- std::vector<char> buf;
-
- /**
- * Start location of the character data.
- */
- SourceOffset start;
-
- /**
- * End location of the character data.
- */
- SourceOffset end;
-
-public:
- /**
- * Default constructor, initializes start and end with zeros.
- */
- DataHandler() : start(0), end(0) {}
-
- /**
- * Returns true if the internal buffer is empty.
- *
- * @return true if no characters were added to the internal buffer, false
- * otherwise.
- */
- bool isEmpty() { return buf.empty(); }
-
- /**
- * Appends a single character to the internal buffer.
- *
- * @param c is the character that should be added to the internal buffer.
- * @param charStart is the start position of the character.
- * @param charEnd is the end position of the character.
- */
- void append(char c, SourceOffset charStart, SourceOffset charEnd)
- {
- if (isEmpty()) {
- start = charStart;
- }
- buf.push_back(c);
- end = charEnd;
- }
-
- /**
- * Appends a string to the internal buffer.
- *
- * @param s is the string that should be added to the internal buffer.
- * @param stringStart is the start position of the string.
- * @param stringEnd is the end position of the string.
- */
- void append(const std::string &s, SourceOffset stringStart,
- SourceOffset stringEnd)
- {
- if (isEmpty()) {
- start = stringStart;
- }
- std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf));
- end = stringEnd;
- }
-
- /**
- * Converts the internal buffer to a variant with attached location
- * information.
- *
- * @param sourceId is the source id which is needed for building the
- * location information.
- * @return a Variant with the internal buffer content as string and
- * the correct start and end location.
- */
- Variant toVariant(SourceId sourceId)
- {
- Variant res = Variant::fromString(std::string(buf.data(), buf.size()));
- res.setLocation({sourceId, start, end});
- return res;
- }
-};
-
OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger)
- : reader(reader), logger(logger), tokenizer(OsmlTokens)
+ : reader(reader),
+ logger(logger),
+ tokenizer(OsmlTokens),
+ data(reader.getSourceId())
{
// Place an intial command representing the complete file on the stack
commands.push(Command{"", Variant::mapType{}, true, true, true, false});
@@ -188,7 +107,7 @@ OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger)
Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
{
bool first = true;
- bool hasCharSiceNSSep = false;
+ bool hasCharSinceNSSep = false;
std::vector<char> identifier;
size_t end = reader.getPeekOffset();
char c, c2;
@@ -197,7 +116,7 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
if ((first && Utils::isIdentifierStartCharacter(c)) ||
(!first && Utils::isIdentifierCharacter(c))) {
identifier.push_back(c);
- } else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) &&
+ } else if (c == ':' && hasCharSinceNSSep && reader.fetchPeek(c2) &&
Utils::isIdentifierStartCharacter(c2)) {
identifier.push_back(c);
} else {
@@ -214,8 +133,8 @@ Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep)
// This is no longer the first character
first = false;
- // Advance the hasCharSiceNSSep flag
- hasCharSiceNSSep = allowNSSep && (c != ':');
+ // Advance the hasCharSinceNSSep flag
+ hasCharSinceNSSep = allowNSSep && (c != ':');
end = reader.getPeekOffset();
reader.consumePeek();
@@ -488,7 +407,10 @@ void OsmlStreamParser::parseBlockComment()
{
Token token;
size_t depth = 1;
- while (tokenizer.read(reader, token)) {
+ while (tokenizer.read(reader, token, data)) {
+ // Throw the comment data away
+ data.clear();
+
if (token.id == OsmlTokens.BlockCommentEnd) {
depth--;
if (depth == 0) {
@@ -514,10 +436,9 @@ void OsmlStreamParser::parseLineComment()
}
}
-bool OsmlStreamParser::checkIssueData(DataHandler &handler)
+bool OsmlStreamParser::checkIssueData()
{
- if (!handler.isEmpty()) {
- data = handler.toVariant(reader.getSourceId());
+ if (!data.empty()) {
location = data.getLocation();
reader.resetPeek();
return true;
@@ -575,12 +496,12 @@ bool OsmlStreamParser::closeField()
OsmlStreamParser::State OsmlStreamParser::parse()
{
- // Handler for incomming data
- DataHandler handler;
+ // Reset the data handler
+ data.clear();
// Read tokens until the outer loop should be left
Token token;
- while (tokenizer.peek(reader, token)) {
+ while (tokenizer.peek(reader, token, data)) {
const TokenId type = token.id;
// Special handling for Backslash and Text
@@ -606,7 +527,7 @@ OsmlStreamParser::State OsmlStreamParser::parse()
// Try to parse a command
if (Utils::isIdentifierStartCharacter(c)) {
// Make sure to issue any data before it is to late
- if (checkIssueData(handler)) {
+ if (checkIssueData()) {
return State::DATA;
}
@@ -633,12 +554,11 @@ OsmlStreamParser::State OsmlStreamParser::parse()
// If this was an annotation start token, add the parsed < to the
// output
if (type == OsmlTokens.AnnotationStart) {
- handler.append('<', token.location.getStart(),
- token.location.getStart() + 1);
+ data.append('<', token.location.getStart(),
+ token.location.getStart() + 1);
}
- handler.append(c, token.location.getStart(),
- reader.getPeekOffset());
+ data.append(c, token.location.getStart(), reader.getPeekOffset());
reader.consumePeek();
continue;
} else if (type == Tokens::Data) {
@@ -647,18 +567,13 @@ OsmlStreamParser::State OsmlStreamParser::parse()
location = token.location;
return State::FIELD_START;
}
-
- // Append the text to the data handler
- handler.append(token.content, token.location.getStart(),
- token.location.getEnd());
-
reader.consumePeek();
continue;
}
// A non-text token was reached, make sure all pending data commands
// have been issued
- if (checkIssueData(handler)) {
+ if (checkIssueData()) {
return State::DATA;
}
@@ -676,34 +591,36 @@ OsmlStreamParser::State OsmlStreamParser::parse()
Command &cmd = commands.top();
if (!cmd.inField) {
cmd.inField = true;
- return State::FIELD_START;
}
- logger.error(
+ return State::FIELD_START;
+/* logger.error(
"Got field start token \"{\", but no command for which to "
"start the field. Write \"\\{\" to insert this sequence as "
"text.",
- token);
+ token);*/
} else if (token.id == OsmlTokens.FieldEnd) {
- if (closeField()) {
+ closeField();
+ return State::FIELD_END;
+/* if (closeField()) {
return State::FIELD_END;
}
logger.error(
"Got field end token \"}\", but there is no field to end. "
"Write \"\\}\" to insert this sequence as text.",
- token);
+ token);*/
} else if (token.id == OsmlTokens.DefaultFieldStart) {
// Try to start a default field the first time the token is reached
Command &topCmd = commands.top();
if (!topCmd.inField) {
topCmd.inField = true;
topCmd.inDefaultField = true;
- return State::FIELD_START;
}
- logger.error(
+ return State::FIELD_START;
+/* logger.error(
"Got default field start token \"{!\", but no command for "
"which to start the field. Write \"\\{!\" to insert this "
"sequence as text",
- token);
+ token);*/
} else if (token.id == OsmlTokens.AnnotationEnd) {
// We got a single annotation end token "\>" -- simply issue the
// ANNOTATION_END event
@@ -717,7 +634,7 @@ OsmlStreamParser::State OsmlStreamParser::parse()
}
// Issue available data
- if (checkIssueData(handler)) {
+ if (checkIssueData()) {
return State::DATA;
}
@@ -737,6 +654,14 @@ OsmlStreamParser::State OsmlStreamParser::parse()
return State::END;
}
+Variant OsmlStreamParser::getText(WhitespaceMode mode)
+{
+ TokenizedData dataFork = data;
+ Variant text = dataFork.text(mode);
+ location = text.getLocation();
+ return text;
+}
+
const Variant &OsmlStreamParser::getCommandName() const
{
return commands.top().name;
diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp
index dc3034c..453a2bb 100644
--- a/src/formats/osml/OsmlStreamParser.hpp
+++ b/src/formats/osml/OsmlStreamParser.hpp
@@ -29,17 +29,19 @@
#ifndef _OUSIA_OSML_STREAM_PARSER_HPP_
#define _OUSIA_OSML_STREAM_PARSER_HPP_
-#include <stack>
+#include <memory>
#include <core/common/Variant.hpp>
+#include <core/common/Whitespace.hpp>
#include <core/parser/utils/Tokenizer.hpp>
+#include <core/parser/utils/TokenizedData.hpp>
namespace ousia {
// Forward declarations
class CharReader;
class Logger;
-class DataHandler;
+class OsmlStreamParserImpl;
/**
* The OsmlStreamParser class provides a low-level reader for the TeX-esque osml
@@ -137,26 +139,15 @@ public:
Variant arguments;
/**
- * Set to true if this is a command with clear begin and end.
- */
- bool hasRange : 1;
-
- /**
- * Set to true if we are currently inside a field of this command.
- */
- bool inField : 1;
-
- /**
- * Set to true if we are currently in the range field of the command
- * (implies inField being set to true).
+ * Vector used as stack for holding the number of opening/closing braces
+ * and the corresponding "isDefaultField" flag.
*/
- bool inRangeField : 1;
+ std::vector<bool> fields;
/**
- * Set to true if we are currently in a field that has been especially
- * marked as default field (using the "|") syntax.
+ * Set to true if this is a command with clear begin and end.
*/
- bool inDefaultField : 1;
+ bool hasRange;
/**
* Default constructor.
@@ -164,7 +155,6 @@ public:
Command()
: hasRange(false),
inField(false),
- inRangeField(false),
inDefaultField()
{
}
@@ -178,15 +168,10 @@ public:
* command.
* @param hasRange should be set to true if this is a command with
* explicit range.
- * @param inField is set to true if we currently are inside a field
- * of this command.
- * @param inRangeField is set to true if we currently are inside the
- * outer field of a ranged command.
* @param inDefaultField is set to true if we currently are in a
* specially marked default field.
*/
- Command(Variant name, Variant arguments, bool hasRange,
- bool inField, bool inRangeField, bool inDefaultField)
+ Command(Variant name, Variant arguments, bool hasRange)
: name(std::move(name)),
arguments(std::move(arguments)),
hasRange(hasRange),
@@ -215,25 +200,20 @@ private:
Tokenizer tokenizer;
/**
- * Stack containing the current commands.
- */
- std::stack<Command> commands;
-
- /**
- * Variant containing the data that has been read (always is a string,
- * contains the exact location of the data in the source file).
+ * TokenizedData instance containing the data that was returned from the
+ * tokenizer as data.
*/
- Variant data;
+ TokenizedData data;
/**
- * Contains the location of the last token.
+ * Stack containing the current commands.
*/
- SourceLocation location;
+ std::stack<Command> commands;
/**
- * Contains the field index of the current command.
+ * Pointer at the internal OsmlStreamParserImpl instance.
*/
- size_t fieldIdx;
+ std::unique_ptr<OsmlStreamParserImpl> impl;
/**
* Function used internall to parse an identifier.
@@ -291,12 +271,10 @@ private:
/**
* Checks whether there is any data pending to be issued, if yes, issues it.
*
- * @param handler is the data handler that contains the data that may be
- * returned to the user.
* @return true if there was any data and DATA should be returned by the
* parse function, false otherwise.
*/
- bool checkIssueData(DataHandler &handler);
+ bool checkIssueData();
/**
* Called before any data is appended to the internal data handler. Checks
@@ -328,6 +306,12 @@ public:
OsmlStreamParser(CharReader &reader, Logger &logger);
/**
+ * Destructor of the OsmlStreamParser, needed to destroy the incomplete
+ * OsmlStreamParserImpl.
+ */
+ ~OsmlStreamParser();
+
+ /**
* Continues parsing. Returns one of the states defined in the State enum.
* Callers should stop once the State::END state is reached. Use the getter
* functions to get more information about the current state, such as the
@@ -344,7 +328,19 @@ public:
* @return a reference at a variant containing the data parsed by the
* "parse" function.
*/
- const Variant &getData() const { return data; }
+ const TokenizedData &getData() const { return data; }
+
+ /**
+ * Returns the complete content of the internal TokenizedData instance as
+ * a single string Variant. This method is mainly used in the unit tests for
+ * this class, it simply calls the text() method of TokenizedData.
+ *
+ * @param mode is the WhitespaceMode that should be used for returning the
+ * text.
+ * @return a string variant containing the text content of the internal
+ * TokenizedData instance or a nullptr variant if there is no text.
+ */
+ Variant getText(WhitespaceMode mode = WhitespaceMode::COLLAPSE);
/**
* Returns a reference at the internally stored command name. Only valid if
@@ -371,13 +367,6 @@ public:
* syntax).
*/
bool inDefaultField() const;
-
- /**
- * Returns a reference at the char reader.
- *
- * @return the last internal token location.
- */
- const SourceLocation &getLocation() const { return location; }
};
}
diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp
index c9254b0..855f80d 100644
--- a/src/formats/osxml/OsxmlEventParser.cpp
+++ b/src/formats/osxml/OsxmlEventParser.cpp
@@ -25,7 +25,6 @@
#include <core/common/Variant.hpp>
#include <core/common/VariantReader.hpp>
#include <core/common/Utils.hpp>
-#include <core/common/WhitespaceHandler.hpp>
#include "OsxmlAttributeLocator.hpp"
#include "OsxmlEventParser.hpp"
@@ -57,17 +56,6 @@ public:
std::vector<char> textBuf;
/**
- * Current whitespace buffer (for the trimming whitspace mode)
- */
- std::vector<char> whitespaceBuf;
-
- /**
- * Flag indicating whether a whitespace character was present (for the
- * collapsing whitespace mode).
- */
- bool hasWhitespace;
-
- /**
* Current character data start.
*/
size_t textStart;
@@ -394,33 +382,17 @@ static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len)
SourceLocation loc = xmlSyncLoggerPosition(p, ulen);
// Fetch some variables for convenience
- const WhitespaceMode mode = parser->getWhitespaceMode();
OsxmlEventParserData &data = parser->getData();
std::vector<char> &textBuf = data.textBuf;
- std::vector<char> &whitespaceBuf = data.whitespaceBuf;
- bool &hasWhitespace = data.hasWhitespace;
- size_t &textStart = data.textStart;
- size_t &textEnd = data.textEnd;
-
- size_t pos = loc.getStart();
- for (size_t i = 0; i < ulen; i++, pos++) {
- switch (mode) {
- case WhitespaceMode::PRESERVE:
- PreservingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
- textStart, textEnd);
- break;
- case WhitespaceMode::TRIM:
- TrimmingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
- textStart, textEnd,
- whitespaceBuf);
- break;
- case WhitespaceMode::COLLAPSE:
- CollapsingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf,
- textStart, textEnd,
- hasWhitespace);
- break;
- }
+
+ // Update start and end position
+ if (textBuf.empty()) {
+ data.textStart = loc.getStart();
}
+ data.textEnd = loc.getEnd();
+
+ // Insert the data into the text buffer
+ textBuf.insert(textBuf.end(), &s[0], &s[ulen]);
}
/* Class OsxmlEvents */
@@ -430,11 +402,7 @@ OsxmlEvents::~OsxmlEvents() {}
/* Class OsxmlEventParser */
OsxmlEventParserData::OsxmlEventParserData()
- : depth(0),
- annotationEndTagDepth(-1),
- hasWhitespace(false),
- textStart(0),
- textEnd(0)
+ : depth(0), annotationEndTagDepth(-1), textStart(0), textEnd(0)
{
}
@@ -466,8 +434,6 @@ Variant OsxmlEventParserData::getText(SourceId sourceId)
// Reset the text buffers
textBuf.clear();
- whitespaceBuf.clear();
- hasWhitespace = false;
textStart = 0;
textEnd = 0;
@@ -482,7 +448,6 @@ OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events,
: reader(reader),
events(events),
logger(logger),
- whitespaceMode(WhitespaceMode::COLLAPSE),
data(new OsxmlEventParserData())
{
}
@@ -532,16 +497,6 @@ void OsxmlEventParser::parse()
}
}
-void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode)
-{
- this->whitespaceMode = whitespaceMode;
-}
-
-WhitespaceMode OsxmlEventParser::getWhitespaceMode() const
-{
- return whitespaceMode;
-}
-
CharReader &OsxmlEventParser::getReader() const { return reader; }
Logger &OsxmlEventParser::getLogger() const { return logger; }
diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp
index e39245f..e3fd5d4 100644
--- a/src/formats/osxml/OsxmlEventParser.hpp
+++ b/src/formats/osxml/OsxmlEventParser.hpp
@@ -32,8 +32,6 @@
#include <memory>
#include <string>
-#include <core/common/Whitespace.hpp>
-
namespace ousia {
// Forward declarations
@@ -99,13 +97,10 @@ public:
virtual void fieldEnd() = 0;
/**
- * Called whenever data is found. Whitespace data is handled as specified
- * and the data has been parsed to the specified variant type. This function
- * is not called if the parsing failed, the parser prints an error message
- * instead.
+ * Called whenever string data is found.
*
- * @param data is the already parsed data that should be passed to the
- * handler.
+ * @param data is a Variant containing the string data that was found in the
+ * XML file.
*/
virtual void data(const Variant &data) = 0;
};
@@ -135,11 +130,6 @@ private:
Logger &logger;
/**
- * Current whitespace mode.
- */
- WhitespaceMode whitespaceMode;
-
- /**
* Data to be used by the internal functions.
*/
std::unique_ptr<OsxmlEventParserData> data;
@@ -171,21 +161,6 @@ public:
void parse();
/**
- * Sets the whitespace handling mode.
- *
- * @param whitespaceMode defines how whitespace in the data should be
- * handled.
- */
- void setWhitespaceMode(WhitespaceMode whitespaceMode);
-
- /**
- * Returns the current whitespace handling mode.
- *
- * @return the currently set whitespace handling mode.
- */
- WhitespaceMode getWhitespaceMode() const;
-
- /**
* Returns the internal CharReader reference.
*
* @return the CharReader reference.