From efe60ac3c3a8725ac71329c0bb19fa9d9c58f399 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 14 Feb 2015 23:42:05 +0100 Subject: Moved specific file format parsers to formats/ folder, moved old tokenizer to css code (this is the only place where it is actually used) --- src/formats/osdmx/OsdmxParser.cpp | 1435 +++++++++++++++++++++++++++++++++++++ src/formats/osdmx/OsdmxParser.hpp | 55 ++ 2 files changed, 1490 insertions(+) create mode 100644 src/formats/osdmx/OsdmxParser.cpp create mode 100644 src/formats/osdmx/OsdmxParser.hpp (limited to 'src/formats') diff --git a/src/formats/osdmx/OsdmxParser.cpp b/src/formats/osdmx/OsdmxParser.cpp new file mode 100644 index 0000000..c46d9de --- /dev/null +++ b/src/formats/osdmx/OsdmxParser.cpp @@ -0,0 +1,1435 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "XmlParser.hpp" + +namespace ousia { + +/* HeadNode Helper class */ + +namespace { +class HeadNode : public Node { +public: + using Node::Node; +}; +} + +namespace RttiTypes { +static Rtti HeadNode = RttiBuilder("HeadNode"); +} + +/* Element Handler Classes */ + +class DocumentHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted document = + project()->createDocument(args["name"].asString()); + document->setLocation(location()); + scope().push(document); + scope().setFlag(ParserFlag::POST_HEAD, false); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DocumentHandler{handlerData}; + } +}; + +class DocumentField : public Node { +public: + DocumentField(Manager &mgr, std::string name, Handle parent) + : Node(mgr, name, parent) + { + } +}; + +namespace RttiTypes { +const Rtti DocumentField = + RttiBuilder("DocumentField").parent(&Node); +} + +class DocumentChildHandler : public Handler { +public: + using Handler::Handler; + + void preamble(Handle parentNode, std::string &fieldName, + DocumentEntity *&parent, bool &inField) + { + // check if the parent in the structure tree was an explicit field + // reference. + inField = parentNode->isa(&RttiTypes::DocumentField); + if (inField) { + fieldName = parentNode->getName(); + parentNode = scope().selectOrThrow( + {&RttiTypes::StructuredEntity, &RttiTypes::AnnotationEntity}); + } else { + // if it wasn't an explicit reference, we use the default field. + fieldName = DEFAULT_FIELD_NAME; + } + // reference the parent entity explicitly. 
+ parent = nullptr; + if (parentNode->isa(&RttiTypes::StructuredEntity)) { + parent = static_cast( + parentNode.cast().get()); + } else if (parentNode->isa(&RttiTypes::AnnotationEntity)) { + parent = static_cast( + parentNode.cast().get()); + } + } + + void start(Variant::mapType &args) override + { + scope().setFlag(ParserFlag::POST_HEAD, true); + Rooted parentNode = scope().selectOrThrow( + {&RttiTypes::Document, &RttiTypes::StructuredEntity, + &RttiTypes::AnnotationEntity, &RttiTypes::DocumentField}); + + std::string fieldName; + DocumentEntity *parent; + bool inField; + + preamble(parentNode, fieldName, parent, inField); + + // try to find a FieldDescriptor for the given tag if we are not in a + // field already. + // TODO: Consider fields of transparent classes + if (!inField && parent != nullptr && + parent->getDescriptor()->hasField(name())) { + Rooted field{new DocumentField( + parentNode->getManager(), fieldName, parentNode)}; + field->setLocation(location()); + scope().push(field); + return; + } + + // Otherwise create a new StructuredEntity + // TODO: Consider Anchors and AnnotationEntities + Rooted strct = scope().resolve( + Utils::split(name(), ':'), logger()); + if (strct == nullptr) { + // if we could not resolve the name, throw an exception. + throw LoggableException( + std::string("\"") + name() + "\" could not be resolved.", + location()); + } + + std::string name; + auto it = args.find("name"); + if (it != args.end()) { + name = it->second.asString(); + args.erase(it); + } + + Rooted entity; + if (parentNode->isa(&RttiTypes::Document)) { + entity = parentNode.cast()->createRootStructuredEntity( + strct, args, name); + } else { + // calculate a path if transparent entities are needed in between. 
+ auto path = parent->getDescriptor()->pathTo(strct); + if (path.empty()) { + throw LoggableException( + std::string("An instance of \"") + strct->getName() + + "\" is not allowed as child of an instance of \"" + + parent->getDescriptor()->getName() + "\"", + location()); + } + + // create all transparent entities until the last field. + for (size_t p = 1; p < path.size() - 1; p = p + 2) { + parent = static_cast( + parent->createChildStructuredEntity( + path[p].cast(), + Variant::mapType{}, path[p - 1]->getName(), + "").get()); + } + entity = parent->createChildStructuredEntity(strct, args, fieldName, + name); + } + entity->setLocation(location()); + scope().push(entity); + } + + void end() override { scope().pop(); } + + void data(const std::string &data, int fieldIdx) override + { + Rooted parentNode = scope().selectOrThrow( + {&RttiTypes::StructuredEntity, &RttiTypes::AnnotationEntity, + &RttiTypes::DocumentField}); + + std::string fieldName; + DocumentEntity *parent; + bool inField; + + preamble(parentNode, fieldName, parent, inField); + + // retrieve the correct FieldDescriptor. + // TODO: Consider fields of transparent classes + Rooted desc = parent->getDescriptor(); + Rooted field = desc->getFieldDescriptor(fieldName); + if (field == nullptr) { + logger().error( + std::string("Can't handle data because no field with name \"") + + fieldName + "\" exists in descriptor\"" + desc->getName() + + "\".", + location()); + return; + } + if (!field->isPrimitive()) { + logger().error(std::string("Can't handle data because field \"") + + fieldName + "\" of descriptor \"" + + desc->getName() + "\" is not primitive!", + location()); + return; + } + + // try to parse the content. + auto res = VariantReader::parseGenericString( + data, logger(), location().getSourceId(), location().getStart()); + if (!res.first) { + return; + } + // try to convert it to the correct type. 
+ if (!field->getPrimitiveType()->build(res.second, logger())) { + return; + } + // add it as primitive content. + parent->createChildDocumentPrimitive(res.second, fieldName); + } + + static Handler *create(const HandlerData &handlerData) + { + return new DocumentChildHandler{handlerData}; + } +}; + +class TypesystemHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + // Create the typesystem instance + Rooted typesystem = + project()->createTypesystem(args["name"].asString()); + typesystem->setLocation(location()); + + // Push the typesystem onto the scope, reset the POST_HEAD flag to false + scope().push(typesystem); + scope().setFlag(ParserFlag::POST_HEAD, false); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new TypesystemHandler{handlerData}; + } +}; + +class TypesystemEnumHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + scope().setFlag(ParserFlag::POST_HEAD, true); + + // Fetch the current typesystem and create the enum node + Rooted typesystem = scope().selectOrThrow(); + Rooted enumType = + typesystem->createEnumType(args["name"].asString()); + enumType->setLocation(location()); + + scope().push(enumType); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new TypesystemEnumHandler{handlerData}; + } +}; + +class TypesystemEnumEntryHandler : public Handler { +public: + using Handler::Handler; + + std::string entry; + + void start(Variant::mapType &args) override {} + + void end() override + { + Rooted enumType = scope().selectOrThrow(); + enumType->addEntry(entry, logger()); + } + + void data(const std::string &data, int field) override + { + if (field != 0) { + // TODO: This should be stored in the HandlerData + logger().error("Enum entry only has one field."); + return; + } + 
entry.append(data); + } + + static Handler *create(const HandlerData &handlerData) + { + return new TypesystemEnumEntryHandler{handlerData}; + } +}; + +class TypesystemStructHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + scope().setFlag(ParserFlag::POST_HEAD, true); + + // Fetch the arguments used for creating this type + const std::string &name = args["name"].asString(); + const std::string &parent = args["parent"].asString(); + + // Fetch the current typesystem and create the struct node + Rooted typesystem = scope().selectOrThrow(); + Rooted structType = typesystem->createStructType(name); + structType->setLocation(location()); + + // Try to resolve the parent type and set it as parent structure + if (!parent.empty()) { + scope().resolve( + parent, structType, logger(), + [](Handle parent, Handle structType, + Logger &logger) { + if (parent != nullptr) { + structType.cast()->setParentStructure( + parent.cast(), logger); + } + }); + } + scope().push(structType); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new TypesystemStructHandler{handlerData}; + } +}; + +class TypesystemStructFieldHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + // Read the argument values + const std::string &name = args["name"].asString(); + const std::string &type = args["type"].asString(); + const Variant &defaultValue = args["default"]; + const bool optional = + !(defaultValue.isObject() && defaultValue.asObject() == nullptr); + + Rooted structType = scope().selectOrThrow(); + Rooted attribute = + structType->createAttribute(name, defaultValue, optional, logger()); + attribute->setLocation(location()); + + // Try to resolve the type and default value + if (optional) { + scope().resolveTypeWithValue( + type, attribute, attribute->getDefaultValue(), logger(), + [](Handle type, Handle 
attribute, Logger &logger) { + if (type != nullptr) { + attribute.cast()->setType(type.cast(), + logger); + } + }); + } else { + scope().resolveType( + type, attribute, logger(), + [](Handle type, Handle attribute, Logger &logger) { + if (type != nullptr) { + attribute.cast()->setType(type.cast(), + logger); + } + }); + } + } + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new TypesystemStructFieldHandler{handlerData}; + } +}; + +class TypesystemConstantHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + scope().setFlag(ParserFlag::POST_HEAD, true); + + // Read the argument values + const std::string &name = args["name"].asString(); + const std::string &type = args["type"].asString(); + const Variant &value = args["value"]; + + Rooted typesystem = scope().selectOrThrow(); + Rooted constant = typesystem->createConstant(name, value); + constant->setLocation(location()); + + // Try to resolve the type + scope().resolveTypeWithValue( + type, constant, constant->getValue(), logger(), + [](Handle type, Handle constant, Logger &logger) { + if (type != nullptr) { + constant.cast()->setType(type.cast(), + logger); + } + }); + } + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new TypesystemConstantHandler{handlerData}; + } +}; + +/* + * Domain Handlers + */ + +class DomainHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted domain = + project()->createDomain(args["name"].asString()); + domain->setLocation(location()); + + scope().push(domain); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainHandler{handlerData}; + } +}; + +class DomainStructHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + 
scope().setFlag(ParserFlag::POST_HEAD, true); + + Rooted domain = scope().selectOrThrow(); + + Rooted structuredClass = domain->createStructuredClass( + args["name"].asString(), args["cardinality"].asCardinality(), + nullptr, args["transparent"].asBool(), args["isRoot"].asBool()); + structuredClass->setLocation(location()); + + const std::string &isa = args["isa"].asString(); + if (!isa.empty()) { + scope().resolve( + isa, structuredClass, logger(), + [](Handle superclass, Handle structuredClass, + Logger &logger) { + if (superclass != nullptr) { + structuredClass.cast()->setSuperclass( + superclass.cast(), logger); + } + }); + } + + scope().push(structuredClass); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainStructHandler{handlerData}; + } +}; + +class DomainAnnotationHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + scope().setFlag(ParserFlag::POST_HEAD, true); + + Rooted domain = scope().selectOrThrow(); + + Rooted annotationClass = + domain->createAnnotationClass(args["name"].asString()); + annotationClass->setLocation(location()); + + scope().push(annotationClass); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainAnnotationHandler{handlerData}; + } +}; + +class DomainAttributesHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + // Fetch the parent node from the scope and push its attributes descriptor + Rooted parent = scope().selectOrThrow(); + + Rooted attrDesc = parent->getAttributesDescriptor(); + attrDesc->setLocation(location()); + + scope().push(attrDesc); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainAttributesHandler{handlerData}; + } +}; + +class DomainFieldHandler : public Handler { +public: + 
using Handler::Handler; + + void start(Variant::mapType &args) override + { + FieldDescriptor::FieldType type; + if (args["isSubtree"].asBool()) { + type = FieldDescriptor::FieldType::SUBTREE; + } else { + type = FieldDescriptor::FieldType::TREE; + } + + Rooted parent = scope().selectOrThrow(); + + Rooted field = parent->createFieldDescriptor( + type, args["name"].asString(), args["optional"].asBool()); + field->setLocation(location()); + + scope().push(field); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainFieldHandler{handlerData}; + } +}; + +class DomainFieldRefHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted parent = scope().selectOrThrow(); + + const std::string &name = args["name"].asString(); + scope().resolve( + name, parent, logger(), + [](Handle field, Handle parent, Logger &logger) { + if (field != nullptr) { + parent.cast()->addFieldDescriptor( + field.cast()); + } + }); + } + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new DomainFieldRefHandler{handlerData}; + } +}; + +class DomainPrimitiveHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted parent = scope().selectOrThrow(); + + Rooted field = parent->createPrimitiveFieldDescriptor( + nullptr, args["name"].asString(), args["optional"].asBool()); + field->setLocation(location()); + + const std::string &type = args["type"].asString(); + scope().resolve( + type, field, logger(), + [](Handle type, Handle field, Logger &logger) { + if (type != nullptr) { + field.cast()->setPrimitiveType( + type.cast()); + } + }); + + scope().push(field); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainPrimitiveHandler{handlerData}; + } +}; + +class DomainChildHandler : 
public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted field = + scope().selectOrThrow(); + + const std::string &ref = args["ref"].asString(); + scope().resolve( + ref, field, logger(), + [](Handle child, Handle field, Logger &logger) { + if (child != nullptr) { + field.cast()->addChild( + child.cast()); + } + }); + } + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new DomainChildHandler{handlerData}; + } +}; + +class DomainParent : public Node { +public: + DomainParent(Manager &mgr, std::string name, Handle parent) + : Node(mgr, name, parent) + { + } +}; + +namespace RttiTypes { +const Rtti DomainParent = + RttiBuilder("DomainParent").parent(&Node); +} + +class DomainParentHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted strct = + scope().selectOrThrow(); + + Rooted parent{new DomainParent( + strct->getManager(), args["name"].asString(), strct)}; + parent->setLocation(location()); + scope().push(parent); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainParentHandler{handlerData}; + } +}; + +class DomainParentFieldHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted parentNameNode = + scope().selectOrThrow(); + FieldDescriptor::FieldType type; + if (args["isSubtree"].asBool()) { + type = FieldDescriptor::FieldType::SUBTREE; + } else { + type = FieldDescriptor::FieldType::TREE; + } + + const std::string &name = args["name"].asString(); + const bool optional = args["optional"].asBool(); + Rooted strct = + parentNameNode->getParent().cast(); + + // resolve the parent, create the declared field and add the declared + // StructuredClass as child to it. 
+ scope().resolve( + parentNameNode->getName(), strct, logger(), + [type, name, optional](Handle parent, Handle strct, + Logger &logger) { + if (parent != nullptr) { + Rooted field = + parent.cast()->createFieldDescriptor( + type, name, optional); + field->addChild(strct.cast()); + } + }); + } + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new DomainParentFieldHandler{handlerData}; + } +}; + +class DomainParentFieldRefHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted parentNameNode = + scope().selectOrThrow(); + + const std::string &name = args["name"].asString(); + Rooted strct = + parentNameNode->getParent().cast(); + auto loc = location(); + + // resolve the parent, get the referenced field and add the declared + // StructuredClass as child to it. + scope().resolve(parentNameNode->getName(), strct, logger(), + [name, loc](Handle parent, + Handle strct, + Logger &logger) { + if (parent != nullptr) { + auto res = parent.cast()->resolve( + &RttiTypes::FieldDescriptor, name); + if (res.size() != 1) { + logger.error( + std::string("Could not find referenced field ") + name, + loc); + return; + } + Rooted field = + res[0].node.cast(); + field->addChild(strct.cast()); + } + }); + } + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new DomainParentFieldRefHandler{handlerData}; + } +}; + +/* + * Import and Include Handler + */ + +class ImportIncludeHandler : public Handler { +public: + using Handler::Handler; + + bool srcInArgs = false; + std::string rel; + std::string type; + std::string src; + + void start(Variant::mapType &args) override + { + rel = args["rel"].asString(); + type = args["type"].asString(); + src = args["src"].asString(); + srcInArgs = !src.empty(); + } + + void data(const std::string &data, int field) override + { + if (srcInArgs) { + logger().error("\"src\" attribute has already 
been set"); + return; + } + if (field != 0) { + logger().error("Command has only one field."); + return; + } + src.append(data); + } +}; + +class ImportHandler : public ImportIncludeHandler { +public: + using ImportIncludeHandler::ImportIncludeHandler; + + void start(Variant::mapType &args) override + { + ImportIncludeHandler::start(args); + + // Make sure imports are still possible + if (scope().getFlag(ParserFlag::POST_HEAD)) { + logger().error("Imports must be listed before other commands.", + location()); + return; + } + } + + void end() override + { + // Fetch the last node and check whether an import is valid at this + // position + Rooted leaf = scope().getLeaf(); + if (leaf == nullptr || !leaf->isa(&RttiTypes::RootNode)) { + logger().error( + "Import not supported here, must be inside a document, domain " + "or typesystem command.", + location()); + return; + } + Rooted leafRootNode = leaf.cast(); + + // Perform the actual import, register the imported node within the leaf + // node + Rooted imported = + context().import(src, type, rel, leafRootNode->getReferenceTypes()); + if (imported != nullptr) { + leafRootNode->reference(imported); + } + } + + static Handler *create(const HandlerData &handlerData) + { + return new ImportHandler{handlerData}; + } +}; + +class IncludeHandler : public ImportIncludeHandler { +public: + using ImportIncludeHandler::ImportIncludeHandler; + + void start(Variant::mapType &args) override + { + ImportIncludeHandler::start(args); + } + + void end() override + { + context().include(src, type, rel, {&RttiTypes::Node}); + } + + static Handler *create(const HandlerData &handlerData) + { + return new IncludeHandler{handlerData}; + } +}; + +namespace ParserStates { +/* Document states */ +static const ParserState Document = + ParserStateBuilder() + .parent(&None) + .createdNodeType(&RttiTypes::Document) + .elementHandler(DocumentHandler::create) + .arguments({Argument::String("name", "")}); + +static const ParserState DocumentChild = + 
ParserStateBuilder() + .parents({&Document, &DocumentChild}) + .createdNodeTypes({&RttiTypes::StructureNode, + &RttiTypes::AnnotationEntity, + &RttiTypes::DocumentField}) + .elementHandler(DocumentChildHandler::create); + +/* Domain states */ +static const ParserState Domain = ParserStateBuilder() + .parents({&None, &Document}) + .createdNodeType(&RttiTypes::Domain) + .elementHandler(DomainHandler::create) + .arguments({Argument::String("name")}); + +static const ParserState DomainStruct = + ParserStateBuilder() + .parent(&Domain) + .createdNodeType(&RttiTypes::StructuredClass) + .elementHandler(DomainStructHandler::create) + .arguments({Argument::String("name"), + Argument::Cardinality("cardinality", Cardinality::any()), + Argument::Bool("isRoot", false), + Argument::Bool("transparent", false), + Argument::String("isa", "")}); + +static const ParserState DomainAnnotation = + ParserStateBuilder() + .parent(&Domain) + .createdNodeType(&RttiTypes::AnnotationClass) + .elementHandler(DomainAnnotationHandler::create) + .arguments({Argument::String("name")}); + +static const ParserState DomainAttributes = + ParserStateBuilder() + .parents({&DomainStruct, &DomainAnnotation}) + .createdNodeType(&RttiTypes::StructType) + .elementHandler(DomainAttributesHandler::create) + .arguments({}); + +static const ParserState DomainAttribute = + ParserStateBuilder() + .parent(&DomainAttributes) + .elementHandler(TypesystemStructFieldHandler::create) + .arguments({Argument::String("name"), Argument::String("type"), + Argument::Any("default", Variant::fromObject(nullptr))}); + +static const ParserState DomainField = + ParserStateBuilder() + .parents({&DomainStruct, &DomainAnnotation}) + .createdNodeType(&RttiTypes::FieldDescriptor) + .elementHandler(DomainFieldHandler::create) + .arguments({Argument::String("name", DEFAULT_FIELD_NAME), + Argument::Bool("isSubtree", false), + Argument::Bool("optional", false)}); + +static const ParserState DomainFieldRef = + ParserStateBuilder() + 
.parents({&DomainStruct, &DomainAnnotation}) + .createdNodeType(&RttiTypes::FieldDescriptor) + .elementHandler(DomainFieldRefHandler::create) + .arguments({Argument::String("name", DEFAULT_FIELD_NAME)}); + +static const ParserState DomainStructPrimitive = + ParserStateBuilder() + .parents({&DomainStruct, &DomainAnnotation}) + .createdNodeType(&RttiTypes::FieldDescriptor) + .elementHandler(DomainPrimitiveHandler::create) + .arguments({Argument::String("name", DEFAULT_FIELD_NAME), + Argument::Bool("optional", false), + Argument::String("type")}); + +static const ParserState DomainStructChild = + ParserStateBuilder() + .parent(&DomainField) + .elementHandler(DomainChildHandler::create) + .arguments({Argument::String("ref")}); + +static const ParserState DomainStructParent = + ParserStateBuilder() + .parent(&DomainStruct) + .createdNodeType(&RttiTypes::DomainParent) + .elementHandler(DomainParentHandler::create) + .arguments({Argument::String("name")}); + +static const ParserState DomainStructParentField = + ParserStateBuilder() + .parent(&DomainStructParent) + .createdNodeType(&RttiTypes::FieldDescriptor) + .elementHandler(DomainParentFieldHandler::create) + .arguments({Argument::String("name", DEFAULT_FIELD_NAME), + Argument::Bool("isSubtree", false), + Argument::Bool("optional", false)}); + +static const ParserState DomainStructParentFieldRef = + ParserStateBuilder() + .parent(&DomainStructParent) + .createdNodeType(&RttiTypes::FieldDescriptor) + .elementHandler(DomainParentFieldRefHandler::create) + .arguments({Argument::String("name", DEFAULT_FIELD_NAME)}); + +/* Typesystem states */ +static const ParserState Typesystem = + ParserStateBuilder() + .parents({&None, &Domain}) + .createdNodeType(&RttiTypes::Typesystem) + .elementHandler(TypesystemHandler::create) + .arguments({Argument::String("name", "")}); + +static const ParserState TypesystemEnum = + ParserStateBuilder() + .parent(&Typesystem) + .createdNodeType(&RttiTypes::EnumType) + 
.elementHandler(TypesystemEnumHandler::create) + .arguments({Argument::String("name")}); + +static const ParserState TypesystemEnumEntry = + ParserStateBuilder() + .parent(&TypesystemEnum) + .elementHandler(TypesystemEnumEntryHandler::create) + .arguments({}); + +static const ParserState TypesystemStruct = + ParserStateBuilder() + .parent(&Typesystem) + .createdNodeType(&RttiTypes::StructType) + .elementHandler(TypesystemStructHandler::create) + .arguments({Argument::String("name"), Argument::String("parent", "")}); + +static const ParserState TypesystemStructField = + ParserStateBuilder() + .parent(&TypesystemStruct) + .elementHandler(TypesystemStructFieldHandler::create) + .arguments({Argument::String("name"), Argument::String("type"), + Argument::Any("default", Variant::fromObject(nullptr))}); + +static const ParserState TypesystemConstant = + ParserStateBuilder() + .parent(&Typesystem) + .createdNodeType(&RttiTypes::Constant) + .elementHandler(TypesystemConstantHandler::create) + .arguments({Argument::String("name"), Argument::String("type"), + Argument::Any("value")}); + +/* Special states for import and include */ +static const ParserState Import = + ParserStateBuilder() + .parents({&Document, &Typesystem, &Domain}) + .elementHandler(ImportHandler::create) + .arguments({Argument::String("rel", ""), Argument::String("type", ""), + Argument::String("src", "")}); + +static const ParserState Include = + ParserStateBuilder() + .parent(&All) + .elementHandler(IncludeHandler::create) + .arguments({Argument::String("rel", ""), Argument::String("type", ""), + Argument::String("src", "")}); + +static const std::multimap XmlStates{ + {"document", &Document}, + {"*", &DocumentChild}, + {"domain", &Domain}, + {"struct", &DomainStruct}, + {"annotation", &DomainAnnotation}, + {"attributes", &DomainAttributes}, + {"attribute", &DomainAttribute}, + {"field", &DomainField}, + {"fieldRef", &DomainFieldRef}, + {"primitive", &DomainStructPrimitive}, + {"child", 
&DomainStructChild}, + {"parent", &DomainStructParent}, + {"field", &DomainStructParentField}, + {"fieldRef", &DomainStructParentFieldRef}, + {"typesystem", &Typesystem}, + {"enum", &TypesystemEnum}, + {"entry", &TypesystemEnumEntry}, + {"struct", &TypesystemStruct}, + {"field", &TypesystemStructField}, + {"constant", &TypesystemConstant}, + {"import", &Import}, + {"include", &Include}}; +} + +/** + * Structure containing the private data that is being passed to the + * XML-Handlers. + */ +struct XMLUserData { + /** + * Containing the depth of the current XML file + */ + size_t depth; + + /** + * Reference to the ParserStack instance. + */ + ParserStack *stack; + + /** + * Reference to the CharReader instance. + */ + CharReader *reader; + + /** + * Constructor of the XMLUserData struct. + * + * @param stack is a pointer to the ParserStack instance. + * @param reader is a pointer to the CharReader instance. + */ + XMLUserData(ParserStack *stack, CharReader *reader) + : depth(0), stack(stack), reader(reader) + { + } +}; + +/** + * Wrapper class around the XML_Parser pointer which safely frees it whenever + * the scope is left (e.g. because an exception was thrown). + */ +class ScopedExpatXmlParser { +private: + /** + * Internal pointer to the XML_Parser instance. + */ + XML_Parser parser; + +public: + /** + * Constructor of the ScopedExpatXmlParser class. Calls XML_ParserCreate + * from the expat library. Throws a LoggableException if the XML parser + * cannot be initialized. + * + * @param encoding is the protocol-defined encoding passed to expat (or + * nullptr if expat should determine the encoding by itself). + */ + ScopedExpatXmlParser(const XML_Char *encoding) : parser(nullptr) + { + parser = XML_ParserCreate(encoding); + if (!parser) { + throw LoggableException{ + "Internal error: Could not create expat XML parser!"}; + } + } + + /** + * Destructor of the ScopedExpatXmlParser, frees the XML parser instance. 
+ */ + ~ScopedExpatXmlParser() + { + if (parser) { + XML_ParserFree(parser); + parser = nullptr; + } + } + + /** + * Returns the XML_Parser pointer. + */ + XML_Parser operator&() { return parser; } +}; + +/* Adapter Expat -> ParserStack */ + +static SourceLocation syncLoggerPosition(XML_Parser p, size_t len = 0) +{ + // Fetch the parser stack and the associated user data + XMLUserData *userData = static_cast(XML_GetUserData(p)); + ParserStack *stack = userData->stack; + + // Fetch the current location in the XML file + size_t offs = XML_GetCurrentByteIndex(p); + + // Build the source location and update the default location of the + // current + // logger instance + SourceLocation loc{stack->getContext().getSourceId(), offs, offs + len}; + stack->getContext().getLogger().setDefaultLocation(loc); + return loc; +} + +enum class XMLAttributeState { + IN_TAG_NAME, + SEARCH_ATTR, + IN_ATTR_NAME, + HAS_ATTR_NAME, + HAS_ATTR_EQUALS, + IN_ATTR_DATA +}; + +static std::map reconstructXMLAttributeOffsets( + CharReader &reader, SourceLocation location) +{ + std::map res; + + // Fork the reader, we don't want to mess up the XML parsing process, do we? + CharReaderFork readerFork = reader.fork(); + + // Move the read cursor to the start location, abort if this does not work + size_t offs = location.getStart(); + if (!location.isValid() || offs != readerFork.seek(offs)) { + return res; + } + + // Now all we need to do is to implement one half of an XML parser. As this + // is inherently complicated we'll totally fail at it. Don't care. All we + // want to get is those darn offsets for pretty error messages... 
(and we + // can assume the XML is valid as it was already read by expat) + XMLAttributeState state = XMLAttributeState::IN_TAG_NAME; + char c; + std::stringstream attrName; + while (readerFork.read(c)) { + // Abort at the end of the tag + if (c == '>' && state != XMLAttributeState::IN_ATTR_DATA) { + return res; + } + + // One state machine to rule them all, one state machine to find them, + // One state machine to bring them all and in the darkness bind them + // (the byte offsets) + switch (state) { + case XMLAttributeState::IN_TAG_NAME: + if (Utils::isWhitespace(c)) { + state = XMLAttributeState::SEARCH_ATTR; + } + break; + case XMLAttributeState::SEARCH_ATTR: + if (!Utils::isWhitespace(c)) { + state = XMLAttributeState::IN_ATTR_NAME; + attrName << c; + } + break; + case XMLAttributeState::IN_ATTR_NAME: + if (Utils::isWhitespace(c)) { + state = XMLAttributeState::HAS_ATTR_NAME; + } else if (c == '=') { + state = XMLAttributeState::HAS_ATTR_EQUALS; + } else { + attrName << c; + } + break; + case XMLAttributeState::HAS_ATTR_NAME: + if (!Utils::isWhitespace(c)) { + if (c == '=') { + state = XMLAttributeState::HAS_ATTR_EQUALS; + break; + } + // Well, this is a strange XML file... We expected to + // see a '=' here! Try to continue with the + // "HAS_ATTR_EQUALS" state as this state will hopefully + // inlcude some error recovery + } else { + // Skip whitespace here + break; + } + // Fallthrough + case XMLAttributeState::HAS_ATTR_EQUALS: + if (!Utils::isWhitespace(c)) { + if (c == '"') { + // Here we are! We have found the beginning of an + // attribute. Let's quickly lock the current offset away + // in the result map + res.emplace(attrName.str(), + SourceLocation{reader.getSourceId(), + readerFork.getOffset()}); + attrName.str(std::string{}); + state = XMLAttributeState::IN_ATTR_DATA; + } else { + // No, this XML file is not well formed. 
Assume we're in + // an attribute name once again + attrName.str(std::string{&c, 1}); + state = XMLAttributeState::IN_ATTR_NAME; + } + } + break; + case XMLAttributeState::IN_ATTR_DATA: + if (c == '"') { + // We're at the end of the attribute data, start anew + state = XMLAttributeState::SEARCH_ATTR; + } + break; + } + } + return res; +} + +static void xmlStartElementHandler(void *p, const XML_Char *name, + const XML_Char **attrs) +{ + XML_Parser parser = static_cast(p); + XMLUserData *userData = static_cast(XML_GetUserData(p)); + ParserStack *stack = userData->stack; + + SourceLocation loc = syncLoggerPosition(parser); + + // Read the argument locations -- this is only a stupid and slow hack, + // but it is necessary, as expat doesn't give use the byte offset of the + // arguments. + std::map offs = + reconstructXMLAttributeOffsets(*userData->reader, loc); + + // Assemble the arguments + Variant::mapType args; + + const XML_Char **attr = attrs; + while (*attr) { + // Convert the C string to a std::string + const std::string key{*(attr++)}; + + // Search the location of the key + SourceLocation keyLoc; + auto it = offs.find(key); + if (it != offs.end()) { + keyLoc = it->second; + } + + // Parse the string, pass the location of the key + std::pair value = VariantReader::parseGenericString( + *(attr++), stack->getContext().getLogger(), keyLoc.getSourceId(), + keyLoc.getStart()); + args.emplace(key, value.second); + } + + // Call the start function + std::string nameStr(name); + if (nameStr != "ousia" || userData->depth > 0) { + stack->start(std::string(name), args, loc); + } + + // Increment the current depth + userData->depth++; +} + +static void xmlEndElementHandler(void *p, const XML_Char *name) +{ + XML_Parser parser = static_cast(p); + XMLUserData *userData = static_cast(XML_GetUserData(p)); + ParserStack *stack = userData->stack; + + syncLoggerPosition(parser); + + // Decrement the current depth + userData->depth--; + + // Call the end function + std::string 
nameStr(name); + if (nameStr != "ousia" || userData->depth > 0) { + stack->end(); + } +} + +static void xmlCharacterDataHandler(void *p, const XML_Char *s, int len) +{ + XML_Parser parser = static_cast(p); + XMLUserData *userData = static_cast(XML_GetUserData(p)); + ParserStack *stack = userData->stack; + + size_t ulen = len > 0 ? static_cast(len) : 0; + syncLoggerPosition(parser, ulen); + const std::string data = Utils::trim(std::string{s, ulen}); + if (!data.empty()) { + stack->data(data); + } +} + +/* Class XmlParser */ + +void XmlParser::doParse(CharReader &reader, ParserContext &ctx) +{ + // Create the parser object + ScopedExpatXmlParser p{"UTF-8"}; + + // Create the parser stack instance, if we're starting on a non-empty scope, + // try to deduce the parser state + ParserStack stack(ctx, ParserStates::XmlStates); + if (!ctx.getScope().isEmpty()) { + if (!stack.deduceState()) { + return; + } + } + + // Pass the reference to the ParserStack to the XML handler + XMLUserData data(&stack, &reader); + XML_SetUserData(&p, &data); + XML_UseParserAsHandlerArg(&p); + + // Set the callback functions + XML_SetStartElementHandler(&p, xmlStartElementHandler); + XML_SetEndElementHandler(&p, xmlEndElementHandler); + XML_SetCharacterDataHandler(&p, xmlCharacterDataHandler); + + // Feed data into expat while there is data to process + constexpr size_t BUFFER_SIZE = 64 * 1024; + while (true) { + // Fetch a buffer from expat for the input data + char *buf = static_cast(XML_GetBuffer(&p, BUFFER_SIZE)); + if (!buf) { + throw LoggableException{ + "Internal error: XML parser out of memory!"}; + } + + // Read into the buffer + size_t bytesRead = reader.readRaw(buf, BUFFER_SIZE); + + // Parse the data and handle any XML error + if (!XML_ParseBuffer(&p, bytesRead, bytesRead == 0)) { + // Fetch the xml parser byte offset + size_t offs = XML_GetCurrentByteIndex(&p); + + // Throw a corresponding exception + XML_Error code = XML_GetErrorCode(&p); + std::string msg = 
std::string{XML_ErrorString(code)}; + throw LoggableException{"XML: " + msg, + SourceLocation{ctx.getSourceId(), offs}}; + } + + // Abort once there are no more bytes in the stream + if (bytesRead == 0) { + break; + } + } +} +} + diff --git a/src/formats/osdmx/OsdmxParser.hpp b/src/formats/osdmx/OsdmxParser.hpp new file mode 100644 index 0000000..c8b6302 --- /dev/null +++ b/src/formats/osdmx/OsdmxParser.hpp @@ -0,0 +1,55 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file XmlParser.hpp + * + * Contains the parser responsible for reading Ousía XML Documents (extension + * oxd) and Ousía XML Modules (extension oxm). + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_XML_PARSER_HPP_ +#define _OUSIA_XML_PARSER_HPP_ + +#include + +namespace ousia { + +/** + * The XmlParser class implements parsing the various types of Ousía XML + * documents using the expat stream XML parser. + */ +class XmlParser : public Parser { +protected: + /** + * Parses the given input stream as XML file and returns the parsed + * top-level node. + * + * @param reader is the CharReader from which the input should be read. + * @param ctx is a reference to the ParserContext instance that should be + * used. 
+ */ + void doParse(CharReader &reader, ParserContext &ctx) override; +}; + +} + +#endif /* _OUSIA_XML_PARSER_HPP_ */ + -- cgit v1.2.3 From 2b0632764c26728675090c4cd0920f1b7c093ed1 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 14 Feb 2015 23:46:30 +0100 Subject: Moved textHandlers to whitespace handlers --- src/formats/osdm/DynamicTokenizer.cpp | 251 ++++++---------------------------- src/formats/osdm/DynamicTokenizer.hpp | 23 +--- 2 files changed, 45 insertions(+), 229 deletions(-) (limited to 'src/formats') diff --git a/src/formats/osdm/DynamicTokenizer.cpp b/src/formats/osdm/DynamicTokenizer.cpp index f2cfcd1..1fac25a 100644 --- a/src/formats/osdm/DynamicTokenizer.cpp +++ b/src/formats/osdm/DynamicTokenizer.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include "DynamicTokenizer.hpp" @@ -102,8 +103,8 @@ public: * @param textLength is the text buffer length of the previous text token. * @param textEnd is the current end location of the previous text token. */ - TokenLookup(const TokenTrie::Node *node, size_t start, - size_t textLength, size_t textEnd) + TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength, + size_t textEnd) : node(node), start(start), textLength(textLength), textEnd(textEnd) { } @@ -155,192 +156,29 @@ public: } }; -/* Internal class TextHandlerBase */ - /** - * Base class used for those classes that may be used as TextHandler in the - * DynamicTokenizer::next function. + * Transforms the given token into a text token containing the extracted + * text. + * + * @param handler is the WhitespaceHandler containing the collected data. + * @param token is the output token to which the text should be written. + * @param sourceId is the source id of the underlying file. */ -class TextHandlerBase { -public: - /** - * Start position of the extracted text. - */ - size_t textStart; - - /** - * End position of the extracted text. - */ - size_t textEnd; - - /** - * Buffer containing the extracted text. 
- */ - std::vector textBuf; - - /** - * Constructor of the TextHandlerBase base class. Initializes the start and - * end position with zeros. - */ - TextHandlerBase() : textStart(0), textEnd(0) {} - - /** - * Transforms the given token into a text token containing the extracted - * text. - * - * @param token is the output token to which the text should be written. - * @param sourceId is the source id of the underlying file. - */ - void buildTextToken(TokenMatch &match, SourceId sourceId) - { - if (match.hasMatch()) { - match.token.content = - std::string{textBuf.data(), match.textLength}; - match.token.location = - SourceLocation{sourceId, textStart, match.textEnd}; - } else { - match.token.content = std::string{textBuf.data(), textBuf.size()}; - match.token.location = SourceLocation{sourceId, textStart, textEnd}; - } - match.token.type = TextToken; - } - - /** - * Returns true if this whitespace handler has found any text and a text - * token could be emitted. - * - * @return true if the internal data buffer is non-empty. - */ - bool hasText() { return !textBuf.empty(); } -}; - -/* Internal class PreservingTextHandler */ - -/** - * The PreservingTextHandler class preserves all characters unmodified, - * including whitepace characters. - */ -class PreservingTextHandler : public TextHandlerBase { -public: - using TextHandlerBase::TextHandlerBase; - - /** - * Appends the given character to the internal text buffer, does not - * eliminate whitespace. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. 
- */ - void append(char c, size_t start, size_t end) - { - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - textBuf.push_back(c); - } -}; - -/* Internal class TrimmingTextHandler */ - -/** - * The TrimmingTextHandler class trims all whitespace characters at the begin - * and the end of a text section but leaves all other characters unmodified, - * including whitepace characters. - */ -class TrimmingTextHandler : public TextHandlerBase { -public: - using TextHandlerBase::TextHandlerBase; - - /** - * Buffer used internally to temporarily store all whitespace characters. - * They are only added to the output buffer if another non-whitespace - * character is reached. - */ - std::vector whitespaceBuf; - - /** - * Appends the given character to the internal text buffer, eliminates - * whitespace characters at the begin and end of the text. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - */ - void append(char c, size_t start, size_t end) - { - // Handle whitespace characters - if (Utils::isWhitespace(c)) { - if (!textBuf.empty()) { - whitespaceBuf.push_back(c); - } - return; - } - - // Set the start and end offset correctly - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - - // Store the character - if (!whitespaceBuf.empty()) { - textBuf.insert(textBuf.end(), whitespaceBuf.begin(), - whitespaceBuf.end()); - whitespaceBuf.clear(); - } - textBuf.push_back(c); - } -}; - -/* Internal class CollapsingTextHandler */ - -/** - * The CollapsingTextHandler trims characters at the beginning and end of the - * text and reduced multiple whitespace characters to a single blank. - */ -class CollapsingTextHandler : public TextHandlerBase { -public: - using TextHandlerBase::TextHandlerBase; - - /** - * Flag set to true if a whitespace character was reached. 
- */ - bool hasWhitespace = false; - - /** - * Appends the given character to the internal text buffer, eliminates - * redundant whitespace characters. - * - * @param c is the character that should be appended to the internal buffer. - * @param start is the start byte offset of the given character. - * @param end is the end byte offset of the given character. - */ - void append(char c, size_t start, size_t end) - { - // Handle whitespace characters - if (Utils::isWhitespace(c)) { - if (!textBuf.empty()) { - hasWhitespace = true; - } - return; - } - - // Set the start and end offset correctly - if (textBuf.empty()) { - textStart = start; - } - textEnd = end; - - // Store the character - if (hasWhitespace) { - textBuf.push_back(' '); - hasWhitespace = false; - } - textBuf.push_back(c); +static void buildTextToken(const WhitespaceHandler &handler, TokenMatch &match, + SourceId sourceId) +{ + if (match.hasMatch()) { + match.token.content = + std::string{handler.textBuf.data(), match.textLength}; + match.token.location = + SourceLocation{sourceId, handler.textStart, match.textEnd}; + } else { + match.token.content = handler.toString(); + match.token.location = + SourceLocation{sourceId, handler.textStart, handler.textEnd}; } -}; + match.token.type = TextToken; +} } /* Class DynamicTokenizer */ @@ -409,9 +247,8 @@ bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token) } // If we found text, emit that text - if (textHandler.hasText() && - (!match.hasMatch() || match.textLength > 0)) { - textHandler.buildTextToken(match, sourceId); + if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) { + buildTextToken(textHandler, match, sourceId); } // Move the read/peek cursor to the end of the token, abort if an error @@ -436,28 +273,28 @@ bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token) return match.hasMatch(); } -bool DynamicTokenizer::read(CharReader &reader,DynamicToken &token) +bool DynamicTokenizer::read(CharReader &reader, 
DynamicToken &token) { switch (whitespaceMode) { case WhitespaceMode::PRESERVE: - return next(reader, token); + return next(reader, token); case WhitespaceMode::TRIM: - return next(reader, token); + return next(reader, token); case WhitespaceMode::COLLAPSE: - return next(reader, token); + return next(reader, token); } return false; } -bool DynamicTokenizer::peek(CharReader &reader,DynamicToken &token) +bool DynamicTokenizer::peek(CharReader &reader, DynamicToken &token) { switch (whitespaceMode) { case WhitespaceMode::PRESERVE: - return next(reader, token); + return next(reader, token); case WhitespaceMode::TRIM: - return next(reader, token); + return next(reader, token); case WhitespaceMode::COLLAPSE: - return next(reader, token); + return next(reader, token); } return false; } @@ -493,7 +330,7 @@ TokenTypeId DynamicTokenizer::registerToken(const std::string &token) // Try to register the token in the trie -- if this fails, remove it // from the tokens list if (!trie.registerToken(token, type)) { - tokens[type] = std::string(); + tokens[type] = std::string{}; nextTokenTypeId = type; return EmptyToken; } @@ -528,17 +365,17 @@ WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; } /* Explicitly instantiate all possible instantiations of the "next" member function */ -template bool DynamicTokenizer::next( +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next( CharReader &reader, DynamicToken &token); -template bool DynamicTokenizer::next( +template bool DynamicTokenizer::next( CharReader &reader, DynamicToken &token); -template bool DynamicTokenizer::next( - CharReader &reader,DynamicToken &token); -template bool 
DynamicTokenizer::next( - CharReader &reader,DynamicToken &token); -template bool DynamicTokenizer::next( - CharReader &reader,DynamicToken &token); -template bool DynamicTokenizer::next( - CharReader &reader,DynamicToken &token); } diff --git a/src/formats/osdm/DynamicTokenizer.hpp b/src/formats/osdm/DynamicTokenizer.hpp index 0cac2e8..3e5aeb3 100644 --- a/src/formats/osdm/DynamicTokenizer.hpp +++ b/src/formats/osdm/DynamicTokenizer.hpp @@ -33,6 +33,7 @@ #include #include +#include #include "TokenTrie.hpp" @@ -95,28 +96,6 @@ struct DynamicToken { const SourceLocation &getLocation() const { return location; } }; -/** - * Enum specifying the whitespace handling of the DynamicTokenizer class when - * reading non-token text. - */ -enum class WhitespaceMode { - /** - * Preserves all whitespaces as they are found in the source file. - */ - PRESERVE, - - /** - * Trims whitespace at the beginning and the end of the found text. - */ - TRIM, - - /** - * Whitespaces are trimmed and collapsed, multiple whitespace characters - * are replaced by a single space character. - */ - COLLAPSE -}; - /** * The DynamicTokenizer is used to extract tokens and chunks of text from a * CharReader. 
It allows to register and unregister tokens while parsing and -- cgit v1.2.3 From 65bbbd778f6e0a3668c859b0e22cced7075a726d Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 14 Feb 2015 23:47:11 +0100 Subject: Moved DynamicTokenizer and TokenTrie to parser/utils --- src/core/parser/utils/TokenTrie.cpp | 119 +++++++++ src/core/parser/utils/TokenTrie.hpp | 150 +++++++++++ src/core/parser/utils/Tokenizer.cpp | 381 ++++++++++++++++++++++++++ src/core/parser/utils/Tokenizer.hpp | 231 ++++++++++++++++ src/formats/osdm/DynamicTokenizer.cpp | 381 -------------------------- src/formats/osdm/DynamicTokenizer.hpp | 231 ---------------- src/formats/osdm/TokenTrie.cpp | 119 --------- src/formats/osdm/TokenTrie.hpp | 150 ----------- test/core/parser/utils/TokenTrieTest.cpp | 92 +++++++ test/core/parser/utils/TokenizerTest.cpp | 415 +++++++++++++++++++++++++++++ test/formats/osdm/DynamicTokenizerTest.cpp | 415 ----------------------------- test/formats/osdm/TokenTrieTest.cpp | 92 ------- 12 files changed, 1388 insertions(+), 1388 deletions(-) create mode 100644 src/core/parser/utils/TokenTrie.cpp create mode 100644 src/core/parser/utils/TokenTrie.hpp create mode 100644 src/core/parser/utils/Tokenizer.cpp create mode 100644 src/core/parser/utils/Tokenizer.hpp delete mode 100644 src/formats/osdm/DynamicTokenizer.cpp delete mode 100644 src/formats/osdm/DynamicTokenizer.hpp delete mode 100644 src/formats/osdm/TokenTrie.cpp delete mode 100644 src/formats/osdm/TokenTrie.hpp create mode 100644 test/core/parser/utils/TokenTrieTest.cpp create mode 100644 test/core/parser/utils/TokenizerTest.cpp delete mode 100644 test/formats/osdm/DynamicTokenizerTest.cpp delete mode 100644 test/formats/osdm/TokenTrieTest.cpp (limited to 'src/formats') diff --git a/src/core/parser/utils/TokenTrie.cpp b/src/core/parser/utils/TokenTrie.cpp new file mode 100644 index 0000000..4a0430b --- /dev/null +++ b/src/core/parser/utils/TokenTrie.cpp @@ -0,0 +1,119 @@ +/* + Ousía + Copyright (C) 2014 Benjamin 
Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "TokenTrie.hpp" + +namespace ousia { + +/* Class DynamicTokenTree::Node */ + +TokenTrie::Node::Node() : type(EmptyToken) {} + +/* Class DynamicTokenTree */ + +bool TokenTrie::registerToken(const std::string &token, + TokenTypeId type) noexcept +{ + // Abort if the token is empty -- this would taint the root node + if (token.empty()) { + return false; + } + + // Iterate over each character in the given string and insert them as + // (new) nodes + Node *node = &root; + for (size_t i = 0; i < token.size(); i++) { + // Insert a new node if this one does not exist + const char c = token[i]; + auto it = node->children.find(c); + if (it == node->children.end()) { + it = node->children.emplace(c, std::make_shared()).first; + } + node = it->second.get(); + } + + // If the resulting node already has a type set, we're screwed. + if (node->type != EmptyToken) { + return false; + } + + // Otherwise just set the type to the given type. 
+ node->type = type; + return true; +} + +bool TokenTrie::unregisterToken(const std::string &token) noexcept +{ + // We cannot remove empty tokens as we need to access the fist character + // upfront + if (token.empty()) { + return false; + } + + // First pass -- search the node in the path that can be deleted + Node *subtreeRoot = &root; + char subtreeKey = token[0]; + Node *node = &root; + for (size_t i = 0; i < token.size(); i++) { + // Go to the next node, abort if the tree ends unexpectedly + auto it = node->children.find(token[i]); + if (it == node->children.end()) { + return false; + } + + // Reset the subtree handler if this node has another type + node = it->second.get(); + if ((node->type != EmptyToken || node->children.size() > 1) && + (i + 1 != token.size())) { + subtreeRoot = node; + subtreeKey = token[i + 1]; + } + } + + // If the node type is already EmptyToken, we cannot do anything here + if (node->type == EmptyToken) { + return false; + } + + // If the target node has children, we cannot delete the subtree. 
Set the + // type to EmptyToken instead + if (!node->children.empty()) { + node->type = EmptyToken; + return true; + } + + // If we end up here, we can safely delete the complete subtree + subtreeRoot->children.erase(subtreeKey); + return true; +} + +TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept +{ + Node const *node = &root; + for (size_t i = 0; i < token.size(); i++) { + const char c = token[i]; + auto it = node->children.find(c); + if (it == node->children.end()) { + return EmptyToken; + } + node = it->second.get(); + } + return node->type; +} +} + diff --git a/src/core/parser/utils/TokenTrie.hpp b/src/core/parser/utils/TokenTrie.hpp new file mode 100644 index 0000000..36c2ffa --- /dev/null +++ b/src/core/parser/utils/TokenTrie.hpp @@ -0,0 +1,150 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file TokenTrie.hpp + * + * Class representing a token trie that can be updated dynamically. + * + * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de) + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_TOKEN_TRIE_HPP_ +#define _OUSIA_TOKEN_TRIE_HPP_ + +#include +#include +#include +#include + +namespace ousia { + +/** + * The TokenTypeId is used to give each token type a unique id. + */ +using TokenTypeId = uint32_t; + +/** + * Token which is not a token. 
+ */ +constexpr TokenTypeId EmptyToken = std::numeric_limits::max(); + +/** + * Token which represents a text token. + */ +constexpr TokenTypeId TextToken = std::numeric_limits::max() - 1; + +/** + * The Tokenizer internally uses a TokenTrie to be efficiently able to identify + * the longest consecutive token in the text. This is equivalent to a prefix + * trie. + * + * A token trie is a construct that structures all special tokens a Tokenizer + * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and + * three. Then the token tree would look like this: + * + * \code{*.txt} + * ~ (0) + * / \ + * a (2) b (0) + * | | + * a (0) a (0) + * | | + * b (1) c (0) + * \endcode + * + * Where the number indicates the corresponding token descriptor identifier. + */ +class TokenTrie { +public: + /** + * Structure used to build the node tree. + */ + struct Node { + /** + * Type used for the child map. + */ + using ChildMap = std::unordered_map>; + + /** + * Map from single characters at the corresponding child nodes. + */ + ChildMap children; + + /** + * Reference at the corresponding token descriptor. Set to nullptr if + * no token is attached to this node. + */ + TokenTypeId type; + + /** + * Default constructor, initializes the descriptor with nullptr. + */ + Node(); + }; + +private: + /** + * Root node of the internal token tree. + */ + Node root; + +public: + /** + * Registers a token containing the given string. Returns false if the + * token already exists, true otherwise. + * + * @param token is the character sequence that should be registered as + * token. + * @param type is the descriptor that should be set for this token. + * @return true if the operation is successful, false otherwise. + */ + bool registerToken(const std::string &token, TokenTypeId type) noexcept; + + /** + * Unregisters the token from the token tree. Returns true if the token was + * unregistered successfully, false otherwise. 
+ * + * @param token is the character sequence that should be unregistered. + * @return true if the operation was successful, false otherwise. + */ + bool unregisterToken(const std::string &token) noexcept; + + /** + * Returns true, if the given token exists within the TokenTree. This + * function is mostly thought for debugging and unit testing. + * + * @param token is the character sequence that should be searched. + * @return the attached token descriptor or nullptr if the given token is + * not found. + */ + TokenTypeId hasToken(const std::string &token) const noexcept; + + /** + * Returns a reference at the root node to be used for traversing the token + * tree. + * + * @return a reference at the root node. + */ + const Node *getRoot() const noexcept { return &root; } +}; +} + +#endif /* _OUSIA_TOKEN_TRIE_HPP_ */ + diff --git a/src/core/parser/utils/Tokenizer.cpp b/src/core/parser/utils/Tokenizer.cpp new file mode 100644 index 0000000..1fac25a --- /dev/null +++ b/src/core/parser/utils/Tokenizer.cpp @@ -0,0 +1,381 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include +#include + +#include +#include +#include +#include + +#include "DynamicTokenizer.hpp" + +namespace ousia { + +namespace { + +/* Internal class TokenMatch */ + +/** + * Contains information about a matching token. + */ +struct TokenMatch { + /** + * Token that was matched. 
+ */ + DynamicToken token; + + /** + * Current length of the data within the text handler. The text buffer needs + * to be trimmed to this length if this token matches. + */ + size_t textLength; + + /** + * End location of the current text handler. This location needs to be used + * for the text token that is emitted before the actual token. + */ + size_t textEnd; + + /** + * Constructor of the TokenMatch class. + */ + TokenMatch() : textLength(0), textEnd(0) {} + + /** + * Returns true if this TokenMatch instance actually represents a match. + */ + bool hasMatch() { return token.type != EmptyToken; } +}; + +/* Internal class TokenLookup */ + +/** + * The TokenLookup class is used to represent a thread in a running token + * lookup. + */ +class TokenLookup { +private: + /** + * Current node within the token trie. + */ + TokenTrie::Node const *node; + + /** + * Start offset within the source file. + */ + size_t start; + + /** + * Current length of the data within the text handler. The text buffer needs + * to be trimmed to this length if this token matches. + */ + size_t textLength; + + /** + * End location of the current text handler. This location needs to be used + * for the text token that is emitted before the actual token. + */ + size_t textEnd; + +public: + /** + * Constructor of the TokenLookup class. + * + * @param node is the current node. + * @param start is the start position. + * @param textLength is the text buffer length of the previous text token. + * @param textEnd is the current end location of the previous text token. + */ + TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength, + size_t textEnd) + : node(node), start(start), textLength(textLength), textEnd(textEnd) + { + } + + /** + * Tries to extend the current path in the token trie with the given + * character. If a complete token is matched, stores this match in the + * tokens list (in case it is longer than any previous token). 
+ * + * @param c is the character that should be appended to the current prefix. + * @param lookups is a list to which new TokeLookup instances are added -- + * which could potentially be expanded in the next iteration. + * @param match is the DynamicToken instance to which the matching token + * should be written. + * @param tokens is a reference at the internal token list of the + * DynamicTokenizer. + * @param end is the end byte offset of the current character. + * @param sourceId is the source if of this file. + */ + void advance(char c, std::vector &lookups, TokenMatch &match, + const std::vector &tokens, SourceOffset end, + SourceId sourceId) + { + // Check whether we can continue the current token path with the given + // character without visiting an already visited node + auto it = node->children.find(c); + if (it == node->children.end()) { + return; + } + + // Check whether the new node represents a complete token a whether it + // is longer than the current token. If yes, replace the current token. + node = it->second.get(); + if (node->type != EmptyToken) { + const std::string &str = tokens[node->type]; + size_t len = str.size(); + if (len > match.token.content.size()) { + match.token = + DynamicToken{node->type, str, {sourceId, start, end}}; + match.textLength = textLength; + match.textEnd = textEnd; + } + } + + // If this state can possibly be advanced, store it in the states list. + if (!node->children.empty()) { + lookups.emplace_back(*this); + } + } +}; + +/** + * Transforms the given token into a text token containing the extracted + * text. + * + * @param handler is the WhitespaceHandler containing the collected data. + * @param token is the output token to which the text should be written. + * @param sourceId is the source id of the underlying file. 
+ */ +static void buildTextToken(const WhitespaceHandler &handler, TokenMatch &match, + SourceId sourceId) +{ + if (match.hasMatch()) { + match.token.content = + std::string{handler.textBuf.data(), match.textLength}; + match.token.location = + SourceLocation{sourceId, handler.textStart, match.textEnd}; + } else { + match.token.content = handler.toString(); + match.token.location = + SourceLocation{sourceId, handler.textStart, handler.textEnd}; + } + match.token.type = TextToken; +} +} + +/* Class DynamicTokenizer */ + +DynamicTokenizer::DynamicTokenizer(WhitespaceMode whitespaceMode) + : whitespaceMode(whitespaceMode), nextTokenTypeId(0) +{ +} + +template +bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token) +{ + // If we're in the read mode, reset the char reader peek position to the + // current read position + if (read) { + reader.resetPeek(); + } + + // Prepare the lookups in the token trie + const TokenTrie::Node *root = trie.getRoot(); + TokenMatch match; + std::vector lookups; + std::vector nextLookups; + + // Instantiate the text handler + TextHandler textHandler; + + // Peek characters from the reader and try to advance the current token tree + // cursor + char c; + size_t charStart = reader.getPeekOffset(); + const SourceId sourceId = reader.getSourceId(); + while (reader.peek(c)) { + const size_t charEnd = reader.getPeekOffset(); + const size_t textLength = textHandler.textBuf.size(); + const size_t textEnd = textHandler.textEnd; + + // If we do not have a match yet, start a new lookup from the root + if (!match.hasMatch()) { + TokenLookup{root, charStart, textLength, textEnd}.advance( + c, nextLookups, match, tokens, charEnd, sourceId); + } + + // Try to advance all other lookups with the new character + for (TokenLookup &lookup : lookups) { + lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId); + } + + // We have found a token and there are no more states to advance or the + // text handler has found something -- abort to 
return the new token + if (match.hasMatch()) { + if ((nextLookups.empty() || textHandler.hasText())) { + break; + } + } else { + // Record all incomming characters + textHandler.append(c, charStart, charEnd); + } + + // Swap the lookups and the nextLookups list + lookups = std::move(nextLookups); + nextLookups.clear(); + + // Advance the offset + charStart = charEnd; + } + + // If we found text, emit that text + if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) { + buildTextToken(textHandler, match, sourceId); + } + + // Move the read/peek cursor to the end of the token, abort if an error + // happens while doing so + if (match.hasMatch()) { + // Make sure we have a valid location + if (match.token.location.getEnd() == InvalidSourceOffset) { + throw OusiaException{"Token end position offset out of range"}; + } + + // Seek to the end of the current token + const size_t end = match.token.location.getEnd(); + if (read) { + reader.seek(end); + } else { + reader.seekPeekCursor(end); + } + token = match.token; + } else { + token = DynamicToken{}; + } + return match.hasMatch(); +} + +bool DynamicTokenizer::read(CharReader &reader, DynamicToken &token) +{ + switch (whitespaceMode) { + case WhitespaceMode::PRESERVE: + return next(reader, token); + case WhitespaceMode::TRIM: + return next(reader, token); + case WhitespaceMode::COLLAPSE: + return next(reader, token); + } + return false; +} + +bool DynamicTokenizer::peek(CharReader &reader, DynamicToken &token) +{ + switch (whitespaceMode) { + case WhitespaceMode::PRESERVE: + return next(reader, token); + case WhitespaceMode::TRIM: + return next(reader, token); + case WhitespaceMode::COLLAPSE: + return next(reader, token); + } + return false; +} + +TokenTypeId DynamicTokenizer::registerToken(const std::string &token) +{ + // Abort if an empty token should be registered + if (token.empty()) { + return EmptyToken; + } + + // Search for a new slot in the tokens list + TokenTypeId type = EmptyToken; + for 
(size_t i = nextTokenTypeId; i < tokens.size(); i++) { + if (tokens[i].empty()) { + tokens[i] = token; + type = i; + break; + } + } + + // No existing slot was found, add a new one -- make sure we do not + // override the special token type handles + if (type == EmptyToken) { + type = tokens.size(); + if (type == TextToken || type == EmptyToken) { + throw OusiaException{"Token type ids depleted!"}; + } + tokens.emplace_back(token); + } + nextTokenTypeId = type + 1; + + // Try to register the token in the trie -- if this fails, remove it + // from the tokens list + if (!trie.registerToken(token, type)) { + tokens[type] = std::string{}; + nextTokenTypeId = type; + return EmptyToken; + } + return type; +} + +bool DynamicTokenizer::unregisterToken(TokenTypeId type) +{ + // Unregister the token from the trie, abort if an invalid type is given + if (type < tokens.size() && trie.unregisterToken(tokens[type])) { + tokens[type] = std::string{}; + nextTokenTypeId = type; + return true; + } + return false; +} + +std::string DynamicTokenizer::getTokenString(TokenTypeId type) +{ + if (type < tokens.size()) { + return tokens[type]; + } + return std::string{}; +} + +void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode) +{ + whitespaceMode = mode; +} + +WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; } + +/* Explicitly instantiate all possible instantiations of the "next" member + function */ +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +template bool DynamicTokenizer::next( + CharReader &reader, DynamicToken &token); +} + diff --git a/src/core/parser/utils/Tokenizer.hpp 
b/src/core/parser/utils/Tokenizer.hpp new file mode 100644 index 0000000..3e5aeb3 --- /dev/null +++ b/src/core/parser/utils/Tokenizer.hpp @@ -0,0 +1,231 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file DynamicTokenizer.hpp + * + * Tokenizer that can be reconfigured at runtime used for parsing the plain + * text format. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_ +#define _OUSIA_DYNAMIC_TOKENIZER_HPP_ + +#include +#include +#include + +#include +#include + +#include "TokenTrie.hpp" + +namespace ousia { + +// Forward declarations +class CharReader; + +/** + * The DynamicToken structure describes a token discovered by the Tokenizer. + */ +struct DynamicToken { + /** + * Id of the type of this token. + */ + TokenTypeId type; + + /** + * String that was matched. + */ + std::string content; + + /** + * Location from which the string was extracted. + */ + SourceLocation location; + + /** + * Default constructor. + */ + DynamicToken() : type(EmptyToken) {} + + /** + * Constructor of the DynamicToken struct. + * + * @param id represents the token type. + * @param content is the string content that has been extracted. + * @param location is the location of the extracted string content in the + * source file. 
+ */ + DynamicToken(TokenTypeId type, const std::string &content, + SourceLocation location) + : type(type), content(content), location(location) + { + } + + /** + * Constructor of the DynamicToken struct, only initializes the token type + * + * @param type is the id corresponding to the type of the token. + */ + DynamicToken(TokenTypeId type) : type(type) {} + + /** + * The getLocation function allows the tokens to be directly passed as + * parameter to Logger or LoggableException instances. + * + * @return a reference at the location field + */ + const SourceLocation &getLocation() const { return location; } +}; + +/** + * The DynamicTokenizer is used to extract tokens and chunks of text from a + * CharReader. It allows to register and unregister tokens while parsing and + * to modify the handling of whitespace characters. Note that the + * DynamicTokenizer always tries to extract the longest possible token from the + * tokenizer. + */ +class DynamicTokenizer { +private: + /** + * Internally used token trie. This object holds all registered tokens. + */ + TokenTrie trie; + + /** + * Flag defining whether whitespaces should be preserved or not. + */ + WhitespaceMode whitespaceMode; + + /** + * Vector containing all registered token types. + */ + std::vector tokens; + + /** + * Next index in the tokens list where to search for a new token id. + */ + size_t nextTokenTypeId; + + /** + * Templated function used internally to read the current token. The + * function is templated in order to force code generation for all six + * combiations of whitespace modes and reading/peeking. + * + * @tparam TextHandler is the type to be used for the textHandler instance. + * @tparam read specifies whether the function should start from and advance + * the read pointer of the char reader. + * @param reader is the CharReader instance from which the data should be + * read. + * @param token is the token structure into which the token information + * should be written. 
+ * @return false if the end of the stream has been reached, true otherwise. + */ + template + bool next(CharReader &reader, DynamicToken &token); + +public: + /** + * Constructor of the DynamicTokenizer class. + * + * @param whitespaceMode specifies how whitespace should be handled. + */ + DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); + + /** + * Registers the given string as a token. Returns a const pointer at a + * TokenDescriptor that will be used to reference the newly created token. + * + * @param token is the token string that should be registered. + * @return a unique identifier for the registered token or EmptyToken if + * an error occured. + */ + TokenTypeId registerToken(const std::string &token); + + /** + * Unregisters the token belonging to the given TokenTypeId. + * + * @param type is the token type that should be unregistered. The + *TokenTypeId + * must have been returned by registerToken. + * @return true if the operation was successful, false otherwise (e.g. + * because the given TokenDescriptor was already unregistered). + */ + bool unregisterToken(TokenTypeId type); + + /** + * Returns the token that was registered under the given TokenTypeId id or + *an + * empty string if an invalid TokenTypeId id is given. + * + * @param type is the TokenTypeId id for which the corresponding token + *string + * should be returned. + * @return the registered token string or an empty string if the given type + * was invalid. + */ + std::string getTokenString(TokenTypeId type); + + /** + * Sets the whitespace mode. + * + * @param whitespaceMode defines how whitespace should be treated in text + * tokens. + */ + void setWhitespaceMode(WhitespaceMode mode); + + /** + * Returns the current value of the whitespace mode. + * + * @return the whitespace mode. + */ + WhitespaceMode getWhitespaceMode(); + + /** + * Reads a new token from the CharReader and stores it in the given + * DynamicToken instance. 
+ * + * @param reader is the CharReader instance from which the data should be + * read. + * @param token is a reference at the token instance into which the Token + * information should be written. + * @return true if a token could be read, false if the end of the stream + * has been reached. + */ + bool read(CharReader &reader, DynamicToken &token); + + /** + * The peek method does not advance the read position of the char reader, + * but reads the next token from the current char reader peek position. + * + * @param reader is the CharReader instance from which the data should be + * read. + * @param token is a reference at the token instance into which the Token + * information should be written. + * @return true if a token could be read, false if the end of the stream + * has been reached. + */ + bool peek(CharReader &reader, DynamicToken &token); +}; +} + +#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ + diff --git a/src/formats/osdm/DynamicTokenizer.cpp b/src/formats/osdm/DynamicTokenizer.cpp deleted file mode 100644 index 1fac25a..0000000 --- a/src/formats/osdm/DynamicTokenizer.cpp +++ /dev/null @@ -1,381 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include -#include - -#include -#include -#include -#include - -#include "DynamicTokenizer.hpp" - -namespace ousia { - -namespace { - -/* Internal class TokenMatch */ - -/** - * Contains information about a matching token. - */ -struct TokenMatch { - /** - * Token that was matched. - */ - DynamicToken token; - - /** - * Current length of the data within the text handler. The text buffer needs - * to be trimmed to this length if this token matches. - */ - size_t textLength; - - /** - * End location of the current text handler. This location needs to be used - * for the text token that is emitted before the actual token. - */ - size_t textEnd; - - /** - * Constructor of the TokenMatch class. - */ - TokenMatch() : textLength(0), textEnd(0) {} - - /** - * Returns true if this TokenMatch instance actually represents a match. - */ - bool hasMatch() { return token.type != EmptyToken; } -}; - -/* Internal class TokenLookup */ - -/** - * The TokenLookup class is used to represent a thread in a running token - * lookup. - */ -class TokenLookup { -private: - /** - * Current node within the token trie. - */ - TokenTrie::Node const *node; - - /** - * Start offset within the source file. - */ - size_t start; - - /** - * Current length of the data within the text handler. The text buffer needs - * to be trimmed to this length if this token matches. - */ - size_t textLength; - - /** - * End location of the current text handler. This location needs to be used - * for the text token that is emitted before the actual token. - */ - size_t textEnd; - -public: - /** - * Constructor of the TokenLookup class. - * - * @param node is the current node. - * @param start is the start position. - * @param textLength is the text buffer length of the previous text token. - * @param textEnd is the current end location of the previous text token. 
- */ - TokenLookup(const TokenTrie::Node *node, size_t start, size_t textLength, - size_t textEnd) - : node(node), start(start), textLength(textLength), textEnd(textEnd) - { - } - - /** - * Tries to extend the current path in the token trie with the given - * character. If a complete token is matched, stores this match in the - * tokens list (in case it is longer than any previous token). - * - * @param c is the character that should be appended to the current prefix. - * @param lookups is a list to which new TokeLookup instances are added -- - * which could potentially be expanded in the next iteration. - * @param match is the DynamicToken instance to which the matching token - * should be written. - * @param tokens is a reference at the internal token list of the - * DynamicTokenizer. - * @param end is the end byte offset of the current character. - * @param sourceId is the source if of this file. - */ - void advance(char c, std::vector &lookups, TokenMatch &match, - const std::vector &tokens, SourceOffset end, - SourceId sourceId) - { - // Check whether we can continue the current token path with the given - // character without visiting an already visited node - auto it = node->children.find(c); - if (it == node->children.end()) { - return; - } - - // Check whether the new node represents a complete token a whether it - // is longer than the current token. If yes, replace the current token. - node = it->second.get(); - if (node->type != EmptyToken) { - const std::string &str = tokens[node->type]; - size_t len = str.size(); - if (len > match.token.content.size()) { - match.token = - DynamicToken{node->type, str, {sourceId, start, end}}; - match.textLength = textLength; - match.textEnd = textEnd; - } - } - - // If this state can possibly be advanced, store it in the states list. - if (!node->children.empty()) { - lookups.emplace_back(*this); - } - } -}; - -/** - * Transforms the given token into a text token containing the extracted - * text. 
- * - * @param handler is the WhitespaceHandler containing the collected data. - * @param token is the output token to which the text should be written. - * @param sourceId is the source id of the underlying file. - */ -static void buildTextToken(const WhitespaceHandler &handler, TokenMatch &match, - SourceId sourceId) -{ - if (match.hasMatch()) { - match.token.content = - std::string{handler.textBuf.data(), match.textLength}; - match.token.location = - SourceLocation{sourceId, handler.textStart, match.textEnd}; - } else { - match.token.content = handler.toString(); - match.token.location = - SourceLocation{sourceId, handler.textStart, handler.textEnd}; - } - match.token.type = TextToken; -} -} - -/* Class DynamicTokenizer */ - -DynamicTokenizer::DynamicTokenizer(WhitespaceMode whitespaceMode) - : whitespaceMode(whitespaceMode), nextTokenTypeId(0) -{ -} - -template -bool DynamicTokenizer::next(CharReader &reader, DynamicToken &token) -{ - // If we're in the read mode, reset the char reader peek position to the - // current read position - if (read) { - reader.resetPeek(); - } - - // Prepare the lookups in the token trie - const TokenTrie::Node *root = trie.getRoot(); - TokenMatch match; - std::vector lookups; - std::vector nextLookups; - - // Instantiate the text handler - TextHandler textHandler; - - // Peek characters from the reader and try to advance the current token tree - // cursor - char c; - size_t charStart = reader.getPeekOffset(); - const SourceId sourceId = reader.getSourceId(); - while (reader.peek(c)) { - const size_t charEnd = reader.getPeekOffset(); - const size_t textLength = textHandler.textBuf.size(); - const size_t textEnd = textHandler.textEnd; - - // If we do not have a match yet, start a new lookup from the root - if (!match.hasMatch()) { - TokenLookup{root, charStart, textLength, textEnd}.advance( - c, nextLookups, match, tokens, charEnd, sourceId); - } - - // Try to advance all other lookups with the new character - for (TokenLookup 
&lookup : lookups) { - lookup.advance(c, nextLookups, match, tokens, charEnd, sourceId); - } - - // We have found a token and there are no more states to advance or the - // text handler has found something -- abort to return the new token - if (match.hasMatch()) { - if ((nextLookups.empty() || textHandler.hasText())) { - break; - } - } else { - // Record all incomming characters - textHandler.append(c, charStart, charEnd); - } - - // Swap the lookups and the nextLookups list - lookups = std::move(nextLookups); - nextLookups.clear(); - - // Advance the offset - charStart = charEnd; - } - - // If we found text, emit that text - if (textHandler.hasText() && (!match.hasMatch() || match.textLength > 0)) { - buildTextToken(textHandler, match, sourceId); - } - - // Move the read/peek cursor to the end of the token, abort if an error - // happens while doing so - if (match.hasMatch()) { - // Make sure we have a valid location - if (match.token.location.getEnd() == InvalidSourceOffset) { - throw OusiaException{"Token end position offset out of range"}; - } - - // Seek to the end of the current token - const size_t end = match.token.location.getEnd(); - if (read) { - reader.seek(end); - } else { - reader.seekPeekCursor(end); - } - token = match.token; - } else { - token = DynamicToken{}; - } - return match.hasMatch(); -} - -bool DynamicTokenizer::read(CharReader &reader, DynamicToken &token) -{ - switch (whitespaceMode) { - case WhitespaceMode::PRESERVE: - return next(reader, token); - case WhitespaceMode::TRIM: - return next(reader, token); - case WhitespaceMode::COLLAPSE: - return next(reader, token); - } - return false; -} - -bool DynamicTokenizer::peek(CharReader &reader, DynamicToken &token) -{ - switch (whitespaceMode) { - case WhitespaceMode::PRESERVE: - return next(reader, token); - case WhitespaceMode::TRIM: - return next(reader, token); - case WhitespaceMode::COLLAPSE: - return next(reader, token); - } - return false; -} - -TokenTypeId 
DynamicTokenizer::registerToken(const std::string &token) -{ - // Abort if an empty token should be registered - if (token.empty()) { - return EmptyToken; - } - - // Search for a new slot in the tokens list - TokenTypeId type = EmptyToken; - for (size_t i = nextTokenTypeId; i < tokens.size(); i++) { - if (tokens[i].empty()) { - tokens[i] = token; - type = i; - break; - } - } - - // No existing slot was found, add a new one -- make sure we do not - // override the special token type handles - if (type == EmptyToken) { - type = tokens.size(); - if (type == TextToken || type == EmptyToken) { - throw OusiaException{"Token type ids depleted!"}; - } - tokens.emplace_back(token); - } - nextTokenTypeId = type + 1; - - // Try to register the token in the trie -- if this fails, remove it - // from the tokens list - if (!trie.registerToken(token, type)) { - tokens[type] = std::string{}; - nextTokenTypeId = type; - return EmptyToken; - } - return type; -} - -bool DynamicTokenizer::unregisterToken(TokenTypeId type) -{ - // Unregister the token from the trie, abort if an invalid type is given - if (type < tokens.size() && trie.unregisterToken(tokens[type])) { - tokens[type] = std::string{}; - nextTokenTypeId = type; - return true; - } - return false; -} - -std::string DynamicTokenizer::getTokenString(TokenTypeId type) -{ - if (type < tokens.size()) { - return tokens[type]; - } - return std::string{}; -} - -void DynamicTokenizer::setWhitespaceMode(WhitespaceMode mode) -{ - whitespaceMode = mode; -} - -WhitespaceMode DynamicTokenizer::getWhitespaceMode() { return whitespaceMode; } - -/* Explicitly instantiate all possible instantiations of the "next" member - function */ -template bool DynamicTokenizer::next( - CharReader &reader, DynamicToken &token); -template bool DynamicTokenizer::next( - CharReader &reader, DynamicToken &token); -template bool DynamicTokenizer::next( - CharReader &reader, DynamicToken &token); -template bool DynamicTokenizer::next( - CharReader &reader, 
DynamicToken &token); -template bool DynamicTokenizer::next( - CharReader &reader, DynamicToken &token); -template bool DynamicTokenizer::next( - CharReader &reader, DynamicToken &token); -} - diff --git a/src/formats/osdm/DynamicTokenizer.hpp b/src/formats/osdm/DynamicTokenizer.hpp deleted file mode 100644 index 3e5aeb3..0000000 --- a/src/formats/osdm/DynamicTokenizer.hpp +++ /dev/null @@ -1,231 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file DynamicTokenizer.hpp - * - * Tokenizer that can be reconfigured at runtime used for parsing the plain - * text format. - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_DYNAMIC_TOKENIZER_HPP_ -#define _OUSIA_DYNAMIC_TOKENIZER_HPP_ - -#include -#include -#include - -#include -#include - -#include "TokenTrie.hpp" - -namespace ousia { - -// Forward declarations -class CharReader; - -/** - * The DynamicToken structure describes a token discovered by the Tokenizer. - */ -struct DynamicToken { - /** - * Id of the type of this token. - */ - TokenTypeId type; - - /** - * String that was matched. - */ - std::string content; - - /** - * Location from which the string was extracted. - */ - SourceLocation location; - - /** - * Default constructor. - */ - DynamicToken() : type(EmptyToken) {} - - /** - * Constructor of the DynamicToken struct. 
- * - * @param id represents the token type. - * @param content is the string content that has been extracted. - * @param location is the location of the extracted string content in the - * source file. - */ - DynamicToken(TokenTypeId type, const std::string &content, - SourceLocation location) - : type(type), content(content), location(location) - { - } - - /** - * Constructor of the DynamicToken struct, only initializes the token type - * - * @param type is the id corresponding to the type of the token. - */ - DynamicToken(TokenTypeId type) : type(type) {} - - /** - * The getLocation function allows the tokens to be directly passed as - * parameter to Logger or LoggableException instances. - * - * @return a reference at the location field - */ - const SourceLocation &getLocation() const { return location; } -}; - -/** - * The DynamicTokenizer is used to extract tokens and chunks of text from a - * CharReader. It allows to register and unregister tokens while parsing and - * to modify the handling of whitespace characters. Note that the - * DynamicTokenizer always tries to extract the longest possible token from the - * tokenizer. - */ -class DynamicTokenizer { -private: - /** - * Internally used token trie. This object holds all registered tokens. - */ - TokenTrie trie; - - /** - * Flag defining whether whitespaces should be preserved or not. - */ - WhitespaceMode whitespaceMode; - - /** - * Vector containing all registered token types. - */ - std::vector tokens; - - /** - * Next index in the tokens list where to search for a new token id. - */ - size_t nextTokenTypeId; - - /** - * Templated function used internally to read the current token. The - * function is templated in order to force code generation for all six - * combiations of whitespace modes and reading/peeking. - * - * @tparam TextHandler is the type to be used for the textHandler instance. 
- * @tparam read specifies whether the function should start from and advance - * the read pointer of the char reader. - * @param reader is the CharReader instance from which the data should be - * read. - * @param token is the token structure into which the token information - * should be written. - * @return false if the end of the stream has been reached, true otherwise. - */ - template - bool next(CharReader &reader, DynamicToken &token); - -public: - /** - * Constructor of the DynamicTokenizer class. - * - * @param whitespaceMode specifies how whitespace should be handled. - */ - DynamicTokenizer(WhitespaceMode whitespaceMode = WhitespaceMode::COLLAPSE); - - /** - * Registers the given string as a token. Returns a const pointer at a - * TokenDescriptor that will be used to reference the newly created token. - * - * @param token is the token string that should be registered. - * @return a unique identifier for the registered token or EmptyToken if - * an error occured. - */ - TokenTypeId registerToken(const std::string &token); - - /** - * Unregisters the token belonging to the given TokenTypeId. - * - * @param type is the token type that should be unregistered. The - *TokenTypeId - * must have been returned by registerToken. - * @return true if the operation was successful, false otherwise (e.g. - * because the given TokenDescriptor was already unregistered). - */ - bool unregisterToken(TokenTypeId type); - - /** - * Returns the token that was registered under the given TokenTypeId id or - *an - * empty string if an invalid TokenTypeId id is given. - * - * @param type is the TokenTypeId id for which the corresponding token - *string - * should be returned. - * @return the registered token string or an empty string if the given type - * was invalid. - */ - std::string getTokenString(TokenTypeId type); - - /** - * Sets the whitespace mode. - * - * @param whitespaceMode defines how whitespace should be treated in text - * tokens. 
- */ - void setWhitespaceMode(WhitespaceMode mode); - - /** - * Returns the current value of the whitespace mode. - * - * @return the whitespace mode. - */ - WhitespaceMode getWhitespaceMode(); - - /** - * Reads a new token from the CharReader and stores it in the given - * DynamicToken instance. - * - * @param reader is the CharReader instance from which the data should be - * read. - * @param token is a reference at the token instance into which the Token - * information should be written. - * @return true if a token could be read, false if the end of the stream - * has been reached. - */ - bool read(CharReader &reader, DynamicToken &token); - - /** - * The peek method does not advance the read position of the char reader, - * but reads the next token from the current char reader peek position. - * - * @param reader is the CharReader instance from which the data should be - * read. - * @param token is a reference at the token instance into which the Token - * information should be written. - * @return true if a token could be read, false if the end of the stream - * has been reached. - */ - bool peek(CharReader &reader, DynamicToken &token); -}; -} - -#endif /* _OUSIA_DYNAMIC_TOKENIZER_HPP_ */ - diff --git a/src/formats/osdm/TokenTrie.cpp b/src/formats/osdm/TokenTrie.cpp deleted file mode 100644 index 4a0430b..0000000 --- a/src/formats/osdm/TokenTrie.cpp +++ /dev/null @@ -1,119 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include "TokenTrie.hpp" - -namespace ousia { - -/* Class DynamicTokenTree::Node */ - -TokenTrie::Node::Node() : type(EmptyToken) {} - -/* Class DynamicTokenTree */ - -bool TokenTrie::registerToken(const std::string &token, - TokenTypeId type) noexcept -{ - // Abort if the token is empty -- this would taint the root node - if (token.empty()) { - return false; - } - - // Iterate over each character in the given string and insert them as - // (new) nodes - Node *node = &root; - for (size_t i = 0; i < token.size(); i++) { - // Insert a new node if this one does not exist - const char c = token[i]; - auto it = node->children.find(c); - if (it == node->children.end()) { - it = node->children.emplace(c, std::make_shared()).first; - } - node = it->second.get(); - } - - // If the resulting node already has a type set, we're screwed. - if (node->type != EmptyToken) { - return false; - } - - // Otherwise just set the type to the given type. 
- node->type = type; - return true; -} - -bool TokenTrie::unregisterToken(const std::string &token) noexcept -{ - // We cannot remove empty tokens as we need to access the fist character - // upfront - if (token.empty()) { - return false; - } - - // First pass -- search the node in the path that can be deleted - Node *subtreeRoot = &root; - char subtreeKey = token[0]; - Node *node = &root; - for (size_t i = 0; i < token.size(); i++) { - // Go to the next node, abort if the tree ends unexpectedly - auto it = node->children.find(token[i]); - if (it == node->children.end()) { - return false; - } - - // Reset the subtree handler if this node has another type - node = it->second.get(); - if ((node->type != EmptyToken || node->children.size() > 1) && - (i + 1 != token.size())) { - subtreeRoot = node; - subtreeKey = token[i + 1]; - } - } - - // If the node type is already EmptyToken, we cannot do anything here - if (node->type == EmptyToken) { - return false; - } - - // If the target node has children, we cannot delete the subtree. 
Set the - // type to EmptyToken instead - if (!node->children.empty()) { - node->type = EmptyToken; - return true; - } - - // If we end up here, we can safely delete the complete subtree - subtreeRoot->children.erase(subtreeKey); - return true; -} - -TokenTypeId TokenTrie::hasToken(const std::string &token) const noexcept -{ - Node const *node = &root; - for (size_t i = 0; i < token.size(); i++) { - const char c = token[i]; - auto it = node->children.find(c); - if (it == node->children.end()) { - return EmptyToken; - } - node = it->second.get(); - } - return node->type; -} -} - diff --git a/src/formats/osdm/TokenTrie.hpp b/src/formats/osdm/TokenTrie.hpp deleted file mode 100644 index 36c2ffa..0000000 --- a/src/formats/osdm/TokenTrie.hpp +++ /dev/null @@ -1,150 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file TokenTrie.hpp - * - * Class representing a token trie that can be updated dynamically. - * - * @author Benjamin Paaßen (astoecke@techfak.uni-bielefeld.de) - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_TOKEN_TRIE_HPP_ -#define _OUSIA_TOKEN_TRIE_HPP_ - -#include -#include -#include -#include - -namespace ousia { - -/** - * The TokenTypeId is used to give each token type a unique id. - */ -using TokenTypeId = uint32_t; - -/** - * Token which is not a token. 
- */ -constexpr TokenTypeId EmptyToken = std::numeric_limits::max(); - -/** - * Token which represents a text token. - */ -constexpr TokenTypeId TextToken = std::numeric_limits::max() - 1; - -/** - * The Tokenizer internally uses a TokenTrie to be efficiently able to identify - * the longest consecutive token in the text. This is equivalent to a prefix - * trie. - * - * A token trie is a construct that structures all special tokens a Tokenizer - * recognizes. Consider the tokens "aab", "a" and "bac" numbered as one, two and - * three. Then the token tree would look like this: - * - * \code{*.txt} - * ~ (0) - * / \ - * a (2) b (0) - * | | - * a (0) a (0) - * | | - * b (1) c (0) - * \endcode - * - * Where the number indicates the corresponding token descriptor identifier. - */ -class TokenTrie { -public: - /** - * Structure used to build the node tree. - */ - struct Node { - /** - * Type used for the child map. - */ - using ChildMap = std::unordered_map>; - - /** - * Map from single characters at the corresponding child nodes. - */ - ChildMap children; - - /** - * Reference at the corresponding token descriptor. Set to nullptr if - * no token is attached to this node. - */ - TokenTypeId type; - - /** - * Default constructor, initializes the descriptor with nullptr. - */ - Node(); - }; - -private: - /** - * Root node of the internal token tree. - */ - Node root; - -public: - /** - * Registers a token containing the given string. Returns false if the - * token already exists, true otherwise. - * - * @param token is the character sequence that should be registered as - * token. - * @param type is the descriptor that should be set for this token. - * @return true if the operation is successful, false otherwise. - */ - bool registerToken(const std::string &token, TokenTypeId type) noexcept; - - /** - * Unregisters the token from the token tree. Returns true if the token was - * unregistered successfully, false otherwise. 
- * - * @param token is the character sequence that should be unregistered. - * @return true if the operation was successful, false otherwise. - */ - bool unregisterToken(const std::string &token) noexcept; - - /** - * Returns true, if the given token exists within the TokenTree. This - * function is mostly thought for debugging and unit testing. - * - * @param token is the character sequence that should be searched. - * @return the attached token descriptor or nullptr if the given token is - * not found. - */ - TokenTypeId hasToken(const std::string &token) const noexcept; - - /** - * Returns a reference at the root node to be used for traversing the token - * tree. - * - * @return a reference at the root node. - */ - const Node *getRoot() const noexcept { return &root; } -}; -} - -#endif /* _OUSIA_TOKEN_TRIE_HPP_ */ - diff --git a/test/core/parser/utils/TokenTrieTest.cpp b/test/core/parser/utils/TokenTrieTest.cpp new file mode 100644 index 0000000..aacd6c0 --- /dev/null +++ b/test/core/parser/utils/TokenTrieTest.cpp @@ -0,0 +1,92 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include + +#include + +namespace ousia { + +static const TokenTypeId t1 = 0; +static const TokenTypeId t2 = 1; +static const TokenTypeId t3 = 2; +static const TokenTypeId t4 = 3; + +TEST(TokenTrie, registerToken) +{ + TokenTrie tree; + + ASSERT_TRUE(tree.registerToken("a", t1)); + ASSERT_TRUE(tree.registerToken("ab", t2)); + ASSERT_TRUE(tree.registerToken("b", t3)); + ASSERT_TRUE(tree.registerToken("hello", t4)); + + ASSERT_FALSE(tree.registerToken("", t1)); + ASSERT_FALSE(tree.registerToken("a", t4)); + ASSERT_FALSE(tree.registerToken("ab", t4)); + ASSERT_FALSE(tree.registerToken("b", t4)); + ASSERT_FALSE(tree.registerToken("hello", t4)); + + ASSERT_EQ(t1, tree.hasToken("a")); + ASSERT_EQ(t2, tree.hasToken("ab")); + ASSERT_EQ(t3, tree.hasToken("b")); + ASSERT_EQ(t4, tree.hasToken("hello")); + ASSERT_EQ(EmptyToken, tree.hasToken("")); + ASSERT_EQ(EmptyToken, tree.hasToken("abc")); +} + +TEST(TokenTrie, unregisterToken) +{ + TokenTrie tree; + + ASSERT_TRUE(tree.registerToken("a", t1)); + ASSERT_FALSE(tree.registerToken("a", t4)); + + ASSERT_TRUE(tree.registerToken("ab", t2)); + ASSERT_FALSE(tree.registerToken("ab", t4)); + + ASSERT_TRUE(tree.registerToken("b", t3)); + ASSERT_FALSE(tree.registerToken("b", t4)); + + ASSERT_EQ(t1, tree.hasToken("a")); + ASSERT_EQ(t2, tree.hasToken("ab")); + ASSERT_EQ(t3, tree.hasToken("b")); + + ASSERT_TRUE(tree.unregisterToken("a")); + ASSERT_FALSE(tree.unregisterToken("a")); + + ASSERT_EQ(EmptyToken, tree.hasToken("a")); + ASSERT_EQ(t2, tree.hasToken("ab")); + ASSERT_EQ(t3, tree.hasToken("b")); + + ASSERT_TRUE(tree.unregisterToken("b")); + ASSERT_FALSE(tree.unregisterToken("b")); + + ASSERT_EQ(EmptyToken, tree.hasToken("a")); + ASSERT_EQ(t2, tree.hasToken("ab")); + ASSERT_EQ(EmptyToken, tree.hasToken("b")); + + ASSERT_TRUE(tree.unregisterToken("ab")); + ASSERT_FALSE(tree.unregisterToken("ab")); + + ASSERT_EQ(EmptyToken, tree.hasToken("a")); + ASSERT_EQ(EmptyToken, tree.hasToken("ab")); + ASSERT_EQ(EmptyToken, 
tree.hasToken("b")); +} +} + diff --git a/test/core/parser/utils/TokenizerTest.cpp b/test/core/parser/utils/TokenizerTest.cpp new file mode 100644 index 0000000..c1f8785 --- /dev/null +++ b/test/core/parser/utils/TokenizerTest.cpp @@ -0,0 +1,415 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include +#include + +namespace ousia { + +TEST(DynamicTokenizer, tokenRegistration) +{ + DynamicTokenizer tokenizer; + + ASSERT_EQ(EmptyToken, tokenizer.registerToken("")); + + ASSERT_EQ(0U, tokenizer.registerToken("a")); + ASSERT_EQ(EmptyToken, tokenizer.registerToken("a")); + ASSERT_EQ("a", tokenizer.getTokenString(0U)); + + ASSERT_EQ(1U, tokenizer.registerToken("b")); + ASSERT_EQ(EmptyToken, tokenizer.registerToken("b")); + ASSERT_EQ("b", tokenizer.getTokenString(1U)); + + ASSERT_EQ(2U, tokenizer.registerToken("c")); + ASSERT_EQ(EmptyToken, tokenizer.registerToken("c")); + ASSERT_EQ("c", tokenizer.getTokenString(2U)); + + ASSERT_TRUE(tokenizer.unregisterToken(1U)); + ASSERT_FALSE(tokenizer.unregisterToken(1U)); + ASSERT_EQ("", tokenizer.getTokenString(1U)); + + ASSERT_EQ(1U, tokenizer.registerToken("d")); + ASSERT_EQ(EmptyToken, tokenizer.registerToken("d")); + ASSERT_EQ("d", tokenizer.getTokenString(1U)); +} + +TEST(DynamicTokenizer, textTokenPreserveWhitespace) +{ + { + CharReader reader{" this \t is only a \n\n test text "}; + // 012345 
6789012345678 9 0123456789012345 + // 0 1 2 3 + DynamicTokenizer tokenizer{WhitespaceMode::PRESERVE}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ(" this \t is only a \n\n test text ", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(36U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(reader, token)); + } + + { + CharReader reader{"this \t is only a \n\n test text"}; + // 01234 5678901234567 8 9012345678901 + // 0 1 2 3 + DynamicTokenizer tokenizer{WhitespaceMode::PRESERVE}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("this \t is only a \n\n test text", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(32U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(reader, token)); + } +} + +TEST(DynamicTokenizer, textTokenTrimWhitespace) +{ + { + CharReader reader{" this \t is only a \n\n test text "}; + // 012345 6789012345678 9 0123456789012345 + // 0 1 2 3 + DynamicTokenizer tokenizer{WhitespaceMode::TRIM}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("this \t is only a \n\n test text", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(1U, loc.getStart()); + ASSERT_EQ(33U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(reader, token)); + } + + { + CharReader reader{"this \t is only a \n\n test text"}; + // 01234 5678901234567 8 9012345678901 + // 0 1 2 3 + DynamicTokenizer tokenizer{WhitespaceMode::TRIM}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("this \t is only a \n\n test text", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(32U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(reader, token)); + } +} + 
+TEST(DynamicTokenizer, textTokenCollapseWhitespace) +{ + { + CharReader reader{" this \t is only a \n\n test text "}; + // 012345 6789012345678 9 0123456789012345 + // 0 1 2 3 + DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("this is only a test text", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(1U, loc.getStart()); + ASSERT_EQ(33U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(reader, token)); + } + + { + CharReader reader{"this \t is only a \n\n test text"}; + // 01234 5678901234567 8 9012345678901 + // 0 1 2 3 + DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE}; + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("this is only a test text", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(32U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(reader, token)); + } +} + +TEST(DynamicTokenizer, simpleReadToken) +{ + CharReader reader{"test1:test2"}; + DynamicTokenizer tokenizer; + + const TokenTypeId tid = tokenizer.registerToken(":"); + ASSERT_EQ(0U, tid); + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test1", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(5U, loc.getEnd()); + + char c; + ASSERT_TRUE(reader.peek(c)); + ASSERT_EQ(':', c); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + + ASSERT_EQ(tid, token.type); + ASSERT_EQ(":", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(5U, loc.getStart()); + ASSERT_EQ(6U, loc.getEnd()); + + char c; + ASSERT_TRUE(reader.peek(c)); + ASSERT_EQ('t', c); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + + ASSERT_EQ(TextToken, token.type); + 
ASSERT_EQ("test2", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(6U, loc.getStart()); + ASSERT_EQ(11U, loc.getEnd()); + + char c; + ASSERT_FALSE(reader.peek(c)); + } +} + +TEST(DynamicTokenizer, simplePeekToken) +{ + CharReader reader{"test1:test2"}; + DynamicTokenizer tokenizer; + + const TokenTypeId tid = tokenizer.registerToken(":"); + ASSERT_EQ(0U, tid); + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.peek(reader, token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test1", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(5U, loc.getEnd()); + ASSERT_EQ(0U, reader.getOffset()); + ASSERT_EQ(5U, reader.getPeekOffset()); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.peek(reader, token)); + + ASSERT_EQ(tid, token.type); + ASSERT_EQ(":", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(5U, loc.getStart()); + ASSERT_EQ(6U, loc.getEnd()); + ASSERT_EQ(0U, reader.getOffset()); + ASSERT_EQ(6U, reader.getPeekOffset()); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.peek(reader, token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test2", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(6U, loc.getStart()); + ASSERT_EQ(11U, loc.getEnd()); + ASSERT_EQ(0U, reader.getOffset()); + ASSERT_EQ(11U, reader.getPeekOffset()); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test1", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(5U, loc.getEnd()); + ASSERT_EQ(5U, reader.getOffset()); + ASSERT_EQ(5U, reader.getPeekOffset()); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + + ASSERT_EQ(tid, token.type); + ASSERT_EQ(":", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(5U, loc.getStart()); + ASSERT_EQ(6U, loc.getEnd()); + ASSERT_EQ(6U, 
reader.getOffset()); + ASSERT_EQ(6U, reader.getPeekOffset()); + } + + { + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("test2", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(6U, loc.getStart()); + ASSERT_EQ(11U, loc.getEnd()); + ASSERT_EQ(11U, reader.getOffset()); + ASSERT_EQ(11U, reader.getPeekOffset()); + } +} + +TEST(DynamicTokenizer, ambiguousTokens) +{ + CharReader reader{"abc"}; + DynamicTokenizer tokenizer; + + TokenTypeId t1 = tokenizer.registerToken("abd"); + TokenTypeId t2 = tokenizer.registerToken("bc"); + + ASSERT_EQ(0U, t1); + ASSERT_EQ(1U, t2); + + DynamicToken token; + ASSERT_TRUE(tokenizer.read(reader, token)); + + ASSERT_EQ(TextToken, token.type); + ASSERT_EQ("a", token.content); + + SourceLocation loc = token.location; + ASSERT_EQ(0U, loc.getStart()); + ASSERT_EQ(1U, loc.getEnd()); + + ASSERT_TRUE(tokenizer.read(reader, token)); + + ASSERT_EQ(t2, token.type); + ASSERT_EQ("bc", token.content); + + loc = token.location; + ASSERT_EQ(1U, loc.getStart()); + ASSERT_EQ(3U, loc.getEnd()); + + ASSERT_FALSE(tokenizer.read(reader, token)); +} + +TEST(DynamicTokenizer, commentTestWhitespacePreserve) +{ + CharReader reader{"Test/Test /* Block Comment */", 0}; + // 012345678901234567890123456789 + // 0 1 2 + DynamicTokenizer tokenizer(WhitespaceMode::PRESERVE); + + const TokenTypeId t1 = tokenizer.registerToken("/"); + const TokenTypeId t2 = tokenizer.registerToken("/*"); + const TokenTypeId t3 = tokenizer.registerToken("*/"); + + std::vector expected = { + {TextToken, "Test", SourceLocation{0, 0, 4}}, + {t1, "/", SourceLocation{0, 4, 5}}, + {TextToken, "Test ", SourceLocation{0, 5, 10}}, + {t2, "/*", SourceLocation{0, 10, 12}}, + {TextToken, " Block Comment ", SourceLocation{0, 12, 27}}, + {t3, "*/", SourceLocation{0, 27, 29}}}; + + DynamicToken t; + for (auto &te : expected) { + EXPECT_TRUE(tokenizer.read(reader, t)); + EXPECT_EQ(te.type, t.type); + 
EXPECT_EQ(te.content, t.content); + EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); + EXPECT_EQ(te.location.getStart(), t.location.getStart()); + EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); + } + ASSERT_FALSE(tokenizer.read(reader, t)); +} + +TEST(DynamicTokenizer, commentTestWhitespaceCollapse) +{ + CharReader reader{"Test/Test /* Block Comment */", 0}; + // 012345678901234567890123456789 + // 0 1 2 + DynamicTokenizer tokenizer(WhitespaceMode::COLLAPSE); + + const TokenTypeId t1 = tokenizer.registerToken("/"); + const TokenTypeId t2 = tokenizer.registerToken("/*"); + const TokenTypeId t3 = tokenizer.registerToken("*/"); + + std::vector expected = { + {TextToken, "Test", SourceLocation{0, 0, 4}}, + {t1, "/", SourceLocation{0, 4, 5}}, + {TextToken, "Test", SourceLocation{0, 5, 9}}, + {t2, "/*", SourceLocation{0, 10, 12}}, + {TextToken, "Block Comment", SourceLocation{0, 13, 26}}, + {t3, "*/", SourceLocation{0, 27, 29}}}; + + DynamicToken t; + for (auto &te : expected) { + EXPECT_TRUE(tokenizer.read(reader, t)); + EXPECT_EQ(te.type, t.type); + EXPECT_EQ(te.content, t.content); + EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); + EXPECT_EQ(te.location.getStart(), t.location.getStart()); + EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); + } + ASSERT_FALSE(tokenizer.read(reader, t)); +} + +} + diff --git a/test/formats/osdm/DynamicTokenizerTest.cpp b/test/formats/osdm/DynamicTokenizerTest.cpp deleted file mode 100644 index c1f8785..0000000 --- a/test/formats/osdm/DynamicTokenizerTest.cpp +++ /dev/null @@ -1,415 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include - -#include -#include - -namespace ousia { - -TEST(DynamicTokenizer, tokenRegistration) -{ - DynamicTokenizer tokenizer; - - ASSERT_EQ(EmptyToken, tokenizer.registerToken("")); - - ASSERT_EQ(0U, tokenizer.registerToken("a")); - ASSERT_EQ(EmptyToken, tokenizer.registerToken("a")); - ASSERT_EQ("a", tokenizer.getTokenString(0U)); - - ASSERT_EQ(1U, tokenizer.registerToken("b")); - ASSERT_EQ(EmptyToken, tokenizer.registerToken("b")); - ASSERT_EQ("b", tokenizer.getTokenString(1U)); - - ASSERT_EQ(2U, tokenizer.registerToken("c")); - ASSERT_EQ(EmptyToken, tokenizer.registerToken("c")); - ASSERT_EQ("c", tokenizer.getTokenString(2U)); - - ASSERT_TRUE(tokenizer.unregisterToken(1U)); - ASSERT_FALSE(tokenizer.unregisterToken(1U)); - ASSERT_EQ("", tokenizer.getTokenString(1U)); - - ASSERT_EQ(1U, tokenizer.registerToken("d")); - ASSERT_EQ(EmptyToken, tokenizer.registerToken("d")); - ASSERT_EQ("d", tokenizer.getTokenString(1U)); -} - -TEST(DynamicTokenizer, textTokenPreserveWhitespace) -{ - { - CharReader reader{" this \t is only a \n\n test text "}; - // 012345 6789012345678 9 0123456789012345 - // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::PRESERVE}; - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ(" this \t is only a \n\n test text ", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(36U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); - } - - { - CharReader reader{"this \t is only a \n\n test text"}; - // 01234 5678901234567 8 9012345678901 - // 0 1 2 3 - 
DynamicTokenizer tokenizer{WhitespaceMode::PRESERVE}; - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("this \t is only a \n\n test text", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(32U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); - } -} - -TEST(DynamicTokenizer, textTokenTrimWhitespace) -{ - { - CharReader reader{" this \t is only a \n\n test text "}; - // 012345 6789012345678 9 0123456789012345 - // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::TRIM}; - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("this \t is only a \n\n test text", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(33U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); - } - - { - CharReader reader{"this \t is only a \n\n test text"}; - // 01234 5678901234567 8 9012345678901 - // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::TRIM}; - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("this \t is only a \n\n test text", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(32U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); - } -} - -TEST(DynamicTokenizer, textTokenCollapseWhitespace) -{ - { - CharReader reader{" this \t is only a \n\n test text "}; - // 012345 6789012345678 9 0123456789012345 - // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE}; - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("this is only a test text", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(33U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); - } 
- - { - CharReader reader{"this \t is only a \n\n test text"}; - // 01234 5678901234567 8 9012345678901 - // 0 1 2 3 - DynamicTokenizer tokenizer{WhitespaceMode::COLLAPSE}; - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("this is only a test text", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(32U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); - } -} - -TEST(DynamicTokenizer, simpleReadToken) -{ - CharReader reader{"test1:test2"}; - DynamicTokenizer tokenizer; - - const TokenTypeId tid = tokenizer.registerToken(":"); - ASSERT_EQ(0U, tid); - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("test1", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); - - char c; - ASSERT_TRUE(reader.peek(c)); - ASSERT_EQ(':', c); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(tid, token.type); - ASSERT_EQ(":", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(5U, loc.getStart()); - ASSERT_EQ(6U, loc.getEnd()); - - char c; - ASSERT_TRUE(reader.peek(c)); - ASSERT_EQ('t', c); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("test2", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(6U, loc.getStart()); - ASSERT_EQ(11U, loc.getEnd()); - - char c; - ASSERT_FALSE(reader.peek(c)); - } -} - -TEST(DynamicTokenizer, simplePeekToken) -{ - CharReader reader{"test1:test2"}; - DynamicTokenizer tokenizer; - - const TokenTypeId tid = tokenizer.registerToken(":"); - ASSERT_EQ(0U, tid); - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.peek(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("test1", token.content); - - SourceLocation loc = 
token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); - ASSERT_EQ(0U, reader.getOffset()); - ASSERT_EQ(5U, reader.getPeekOffset()); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.peek(reader, token)); - - ASSERT_EQ(tid, token.type); - ASSERT_EQ(":", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(5U, loc.getStart()); - ASSERT_EQ(6U, loc.getEnd()); - ASSERT_EQ(0U, reader.getOffset()); - ASSERT_EQ(6U, reader.getPeekOffset()); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.peek(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("test2", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(6U, loc.getStart()); - ASSERT_EQ(11U, loc.getEnd()); - ASSERT_EQ(0U, reader.getOffset()); - ASSERT_EQ(11U, reader.getPeekOffset()); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("test1", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); - ASSERT_EQ(5U, reader.getOffset()); - ASSERT_EQ(5U, reader.getPeekOffset()); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(tid, token.type); - ASSERT_EQ(":", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(5U, loc.getStart()); - ASSERT_EQ(6U, loc.getEnd()); - ASSERT_EQ(6U, reader.getOffset()); - ASSERT_EQ(6U, reader.getPeekOffset()); - } - - { - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("test2", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(6U, loc.getStart()); - ASSERT_EQ(11U, loc.getEnd()); - ASSERT_EQ(11U, reader.getOffset()); - ASSERT_EQ(11U, reader.getPeekOffset()); - } -} - -TEST(DynamicTokenizer, ambiguousTokens) -{ - CharReader reader{"abc"}; - DynamicTokenizer tokenizer; - - TokenTypeId t1 = tokenizer.registerToken("abd"); - 
TokenTypeId t2 = tokenizer.registerToken("bc"); - - ASSERT_EQ(0U, t1); - ASSERT_EQ(1U, t2); - - DynamicToken token; - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(TextToken, token.type); - ASSERT_EQ("a", token.content); - - SourceLocation loc = token.location; - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(1U, loc.getEnd()); - - ASSERT_TRUE(tokenizer.read(reader, token)); - - ASSERT_EQ(t2, token.type); - ASSERT_EQ("bc", token.content); - - loc = token.location; - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(3U, loc.getEnd()); - - ASSERT_FALSE(tokenizer.read(reader, token)); -} - -TEST(DynamicTokenizer, commentTestWhitespacePreserve) -{ - CharReader reader{"Test/Test /* Block Comment */", 0}; - // 012345678901234567890123456789 - // 0 1 2 - DynamicTokenizer tokenizer(WhitespaceMode::PRESERVE); - - const TokenTypeId t1 = tokenizer.registerToken("/"); - const TokenTypeId t2 = tokenizer.registerToken("/*"); - const TokenTypeId t3 = tokenizer.registerToken("*/"); - - std::vector expected = { - {TextToken, "Test", SourceLocation{0, 0, 4}}, - {t1, "/", SourceLocation{0, 4, 5}}, - {TextToken, "Test ", SourceLocation{0, 5, 10}}, - {t2, "/*", SourceLocation{0, 10, 12}}, - {TextToken, " Block Comment ", SourceLocation{0, 12, 27}}, - {t3, "*/", SourceLocation{0, 27, 29}}}; - - DynamicToken t; - for (auto &te : expected) { - EXPECT_TRUE(tokenizer.read(reader, t)); - EXPECT_EQ(te.type, t.type); - EXPECT_EQ(te.content, t.content); - EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); - EXPECT_EQ(te.location.getStart(), t.location.getStart()); - EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); - } - ASSERT_FALSE(tokenizer.read(reader, t)); -} - -TEST(DynamicTokenizer, commentTestWhitespaceCollapse) -{ - CharReader reader{"Test/Test /* Block Comment */", 0}; - // 012345678901234567890123456789 - // 0 1 2 - DynamicTokenizer tokenizer(WhitespaceMode::COLLAPSE); - - const TokenTypeId t1 = tokenizer.registerToken("/"); - const TokenTypeId t2 = 
tokenizer.registerToken("/*"); - const TokenTypeId t3 = tokenizer.registerToken("*/"); - - std::vector expected = { - {TextToken, "Test", SourceLocation{0, 0, 4}}, - {t1, "/", SourceLocation{0, 4, 5}}, - {TextToken, "Test", SourceLocation{0, 5, 9}}, - {t2, "/*", SourceLocation{0, 10, 12}}, - {TextToken, "Block Comment", SourceLocation{0, 13, 26}}, - {t3, "*/", SourceLocation{0, 27, 29}}}; - - DynamicToken t; - for (auto &te : expected) { - EXPECT_TRUE(tokenizer.read(reader, t)); - EXPECT_EQ(te.type, t.type); - EXPECT_EQ(te.content, t.content); - EXPECT_EQ(te.location.getSourceId(), t.location.getSourceId()); - EXPECT_EQ(te.location.getStart(), t.location.getStart()); - EXPECT_EQ(te.location.getEnd(), t.location.getEnd()); - } - ASSERT_FALSE(tokenizer.read(reader, t)); -} - -} - diff --git a/test/formats/osdm/TokenTrieTest.cpp b/test/formats/osdm/TokenTrieTest.cpp deleted file mode 100644 index aacd6c0..0000000 --- a/test/formats/osdm/TokenTrieTest.cpp +++ /dev/null @@ -1,92 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include - -#include - -namespace ousia { - -static const TokenTypeId t1 = 0; -static const TokenTypeId t2 = 1; -static const TokenTypeId t3 = 2; -static const TokenTypeId t4 = 3; - -TEST(TokenTrie, registerToken) -{ - TokenTrie tree; - - ASSERT_TRUE(tree.registerToken("a", t1)); - ASSERT_TRUE(tree.registerToken("ab", t2)); - ASSERT_TRUE(tree.registerToken("b", t3)); - ASSERT_TRUE(tree.registerToken("hello", t4)); - - ASSERT_FALSE(tree.registerToken("", t1)); - ASSERT_FALSE(tree.registerToken("a", t4)); - ASSERT_FALSE(tree.registerToken("ab", t4)); - ASSERT_FALSE(tree.registerToken("b", t4)); - ASSERT_FALSE(tree.registerToken("hello", t4)); - - ASSERT_EQ(t1, tree.hasToken("a")); - ASSERT_EQ(t2, tree.hasToken("ab")); - ASSERT_EQ(t3, tree.hasToken("b")); - ASSERT_EQ(t4, tree.hasToken("hello")); - ASSERT_EQ(EmptyToken, tree.hasToken("")); - ASSERT_EQ(EmptyToken, tree.hasToken("abc")); -} - -TEST(TokenTrie, unregisterToken) -{ - TokenTrie tree; - - ASSERT_TRUE(tree.registerToken("a", t1)); - ASSERT_FALSE(tree.registerToken("a", t4)); - - ASSERT_TRUE(tree.registerToken("ab", t2)); - ASSERT_FALSE(tree.registerToken("ab", t4)); - - ASSERT_TRUE(tree.registerToken("b", t3)); - ASSERT_FALSE(tree.registerToken("b", t4)); - - ASSERT_EQ(t1, tree.hasToken("a")); - ASSERT_EQ(t2, tree.hasToken("ab")); - ASSERT_EQ(t3, tree.hasToken("b")); - - ASSERT_TRUE(tree.unregisterToken("a")); - ASSERT_FALSE(tree.unregisterToken("a")); - - ASSERT_EQ(EmptyToken, tree.hasToken("a")); - ASSERT_EQ(t2, tree.hasToken("ab")); - ASSERT_EQ(t3, tree.hasToken("b")); - - ASSERT_TRUE(tree.unregisterToken("b")); - ASSERT_FALSE(tree.unregisterToken("b")); - - ASSERT_EQ(EmptyToken, tree.hasToken("a")); - ASSERT_EQ(t2, tree.hasToken("ab")); - ASSERT_EQ(EmptyToken, tree.hasToken("b")); - - ASSERT_TRUE(tree.unregisterToken("ab")); - ASSERT_FALSE(tree.unregisterToken("ab")); - - ASSERT_EQ(EmptyToken, tree.hasToken("a")); - ASSERT_EQ(EmptyToken, tree.hasToken("ab")); - ASSERT_EQ(EmptyToken, 
tree.hasToken("b")); -} -} - -- cgit v1.2.3 From ce5ab62b564476dfacba33507f1541166fda2bfb Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 14 Feb 2015 23:47:40 +0100 Subject: renamed osdm to osml and osdmx to osxml --- src/formats/osdm/OsdmStreamParser.cpp | 640 --------------- src/formats/osdm/OsdmStreamParser.hpp | 351 -------- src/formats/osdmx/OsdmxParser.cpp | 1435 --------------------------------- src/formats/osdmx/OsdmxParser.hpp | 55 -- src/formats/osml/OsmlParser.cpp | 57 ++ src/formats/osml/OsmlParser.hpp | 48 ++ src/formats/osml/OsmlStreamParser.cpp | 640 +++++++++++++++ src/formats/osml/OsmlStreamParser.hpp | 350 ++++++++ src/formats/osxml/OsxmlParser.cpp | 1435 +++++++++++++++++++++++++++++++++ src/formats/osxml/OsxmlParser.hpp | 55 ++ 10 files changed, 2585 insertions(+), 2481 deletions(-) delete mode 100644 src/formats/osdm/OsdmStreamParser.cpp delete mode 100644 src/formats/osdm/OsdmStreamParser.hpp delete mode 100644 src/formats/osdmx/OsdmxParser.cpp delete mode 100644 src/formats/osdmx/OsdmxParser.hpp create mode 100644 src/formats/osml/OsmlParser.cpp create mode 100644 src/formats/osml/OsmlParser.hpp create mode 100644 src/formats/osml/OsmlStreamParser.cpp create mode 100644 src/formats/osml/OsmlStreamParser.hpp create mode 100644 src/formats/osxml/OsxmlParser.cpp create mode 100644 src/formats/osxml/OsxmlParser.hpp (limited to 'src/formats') diff --git a/src/formats/osdm/OsdmStreamParser.cpp b/src/formats/osdm/OsdmStreamParser.cpp deleted file mode 100644 index 8cb8caf..0000000 --- a/src/formats/osdm/OsdmStreamParser.cpp +++ /dev/null @@ -1,640 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include -#include -#include -#include - -#include "OsdmStreamParser.hpp" - -namespace ousia { - -/** - * Plain format default tokenizer. - */ -class PlainFormatTokens : public DynamicTokenizer { -public: - /** - * Id of the backslash token. - */ - TokenTypeId Backslash; - - /** - * Id of the line comment token. - */ - TokenTypeId LineComment; - - /** - * Id of the block comment start token. - */ - TokenTypeId BlockCommentStart; - - /** - * Id of the block comment end token. - */ - TokenTypeId BlockCommentEnd; - - /** - * Id of the field start token. - */ - TokenTypeId FieldStart; - - /** - * Id of the field end token. - */ - TokenTypeId FieldEnd; - - /** - * Registers the plain format tokens in the internal tokenizer. - */ - PlainFormatTokens() - { - Backslash = registerToken("\\"); - LineComment = registerToken("%"); - BlockCommentStart = registerToken("%{"); - BlockCommentEnd = registerToken("}%"); - FieldStart = registerToken("{"); - FieldEnd = registerToken("}"); - } -}; - -static const PlainFormatTokens Tokens; - -/** - * Class used internally to collect data issued via "DATA" event. - */ -class DataHandler { -private: - /** - * Internal character buffer. - */ - std::vector buf; - - /** - * Start location of the character data. - */ - SourceOffset start; - - /** - * End location of the character data. - */ - SourceOffset end; - -public: - /** - * Default constructor, initializes start and end with zeros. - */ - DataHandler() : start(0), end(0) {} - - /** - * Returns true if the internal buffer is empty. - * - * @return true if no characters were added to the internal buffer, false - * otherwise. 
- */ - bool isEmpty() { return buf.empty(); } - - /** - * Appends a single character to the internal buffer. - * - * @param c is the character that should be added to the internal buffer. - * @param charStart is the start position of the character. - * @param charEnd is the end position of the character. - */ - void append(char c, SourceOffset charStart, SourceOffset charEnd) - { - if (isEmpty()) { - start = charStart; - } - buf.push_back(c); - end = charEnd; - } - - /** - * Appends a string to the internal buffer. - * - * @param s is the string that should be added to the internal buffer. - * @param stringStart is the start position of the string. - * @param stringEnd is the end position of the string. - */ - void append(const std::string &s, SourceOffset stringStart, - SourceOffset stringEnd) - { - if (isEmpty()) { - start = stringStart; - } - std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf)); - end = stringEnd; - } - - /** - * Converts the internal buffer to a variant with attached location - * information. - * - * @param sourceId is the source id which is needed for building the - * location information. - * @return a Variant with the internal buffer content as string and - * the correct start and end location. 
- */ - Variant toVariant(SourceId sourceId) - { - Variant res = Variant::fromString(std::string(buf.data(), buf.size())); - res.setLocation({sourceId, start, end}); - return res; - } -}; - -OsdmStreamParser::OsdmStreamParser(CharReader &reader, Logger &logger) - : reader(reader), logger(logger), tokenizer(Tokens) -{ - // Place an intial command representing the complete file on the stack - commands.push(Command{"", Variant::mapType{}, true, true, true}); -} - -Variant OsdmStreamParser::parseIdentifier(size_t start, bool allowNSSep) -{ - bool first = true; - bool hasCharSiceNSSep = false; - std::vector identifier; - size_t end = reader.getPeekOffset(); - char c, c2; - while (reader.peek(c)) { - // Abort if this character is not a valid identifer character - if ((first && Utils::isIdentifierStartCharacter(c)) || - (!first && Utils::isIdentifierCharacter(c))) { - identifier.push_back(c); - } else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) && - Utils::isIdentifierStartCharacter(c2)) { - identifier.push_back(c); - } else { - if (c == ':' && allowNSSep) { - logger.error( - "Expected character before and after namespace separator " - "\":\"", - reader); - } - reader.resetPeek(); - break; - } - - // This is no longer the first character - first = false; - - // Advance the hasCharSiceNSSep flag - hasCharSiceNSSep = allowNSSep && (c != ':'); - - end = reader.getPeekOffset(); - reader.consumePeek(); - } - - // Return the identifier at its location - Variant res = - Variant::fromString(std::string(identifier.data(), identifier.size())); - res.setLocation({reader.getSourceId(), start, end}); - return res; -} - -OsdmStreamParser::State OsdmStreamParser::parseBeginCommand() -{ - // Expect a '{' after the command - reader.consumeWhitespace(); - if (!reader.expect('{')) { - logger.error("Expected \"{\" after \\begin", reader); - return State::NONE; - } - - // Parse the name of the command that should be opened - Variant commandName = 
parseIdentifier(reader.getOffset(), true); - if (commandName.asString().empty()) { - logger.error("Expected identifier", commandName); - return State::ERROR; - } - - // Check whether the next character is a '#', indicating the start of the - // command name - Variant commandArgName; - SourceOffset start = reader.getOffset(); - if (reader.expect('#')) { - commandArgName = parseIdentifier(start); - if (commandArgName.asString().empty()) { - logger.error("Expected identifier after \"#\"", commandArgName); - } - } - - if (!reader.expect('}')) { - logger.error("Expected \"}\"", reader); - return State::ERROR; - } - - // Parse the arguments - Variant commandArguments = parseCommandArguments(std::move(commandArgName)); - - // Push the command onto the command stack - pushCommand(std::move(commandName), std::move(commandArguments), true); - - return State::COMMAND; -} - -static bool checkStillInField(const OsdmStreamParser::Command &cmd, - const Variant &endName, Logger &logger) -{ - if (cmd.inField && !cmd.inRangeField) { - logger.error(std::string("\\end in open field of command \"") + - cmd.name.asString() + std::string("\""), - endName); - logger.note(std::string("Open command started here:"), cmd.name); - return true; - } - return false; -} - -OsdmStreamParser::State OsdmStreamParser::parseEndCommand() -{ - // Expect a '{' after the command - if (!reader.expect('{')) { - logger.error("Expected \"{\" after \\end", reader); - return State::NONE; - } - - // Fetch the name of the command that should be ended here - Variant name = parseIdentifier(reader.getOffset(), true); - - // Make sure the given command name is not empty - if (name.asString().empty()) { - logger.error("Expected identifier", name); - return State::ERROR; - } - - // Make sure the command name is terminated with a '}' - if (!reader.expect('}')) { - logger.error("Expected \"}\"", reader); - return State::ERROR; - } - - // Unroll the command stack up to the last range command - while 
(!commands.top().hasRange) { - if (checkStillInField(commands.top(), name, logger)) { - return State::ERROR; - } - commands.pop(); - } - - // Make sure we're not in an open field of this command - if (checkStillInField(commands.top(), name, logger)) { - return State::ERROR; - } - - // Special error message if the top-level command is reached - if (commands.size() == 1) { - logger.error(std::string("Cannot end command \"") + name.asString() + - std::string("\" here, no command open"), - name); - return State::ERROR; - } - - // Inform the about command mismatches - const Command &cmd = commands.top(); - if (commands.top().name.asString() != name.asString()) { - logger.error(std::string("Trying to end command \"") + - cmd.name.asString() + - std::string("\", but open command is \"") + - name.asString() + std::string("\""), - name); - logger.note("Last command was opened here:", cmd.name); - return State::ERROR; - } - - // Set the location to the location of the command that was ended, then end - // the current command - location = name.getLocation(); - commands.pop(); - return cmd.inRangeField ? 
State::FIELD_END : State::NONE; -} - -Variant OsdmStreamParser::parseCommandArguments(Variant commandArgName) -{ - // Parse the arguments using the universal VariantReader - Variant commandArguments; - if (reader.expect('[')) { - auto res = VariantReader::parseObject(reader, logger, ']'); - commandArguments = res.second; - } else { - commandArguments = Variant::mapType{}; - } - - // Insert the parsed name, make sure "name" was not specified in the - // arguments - if (commandArgName.isString()) { - auto res = - commandArguments.asMap().emplace("name", std::move(commandArgName)); - if (!res.second) { - logger.error("Name argument specified multiple times", - SourceLocation{}, MessageMode::NO_CONTEXT); - logger.note("First occurance is here: ", commandArgName); - logger.note("Second occurance is here: ", res.first->second); - } - } - return commandArguments; -} - -void OsdmStreamParser::pushCommand(Variant commandName, - Variant commandArguments, bool hasRange) -{ - // Store the location on the stack - location = commandName.getLocation(); - - // Place the command on the command stack, remove the last commands if we're - // not currently inside a field of these commands - while (!commands.top().inField) { - commands.pop(); - } - commands.push(Command{std::move(commandName), std::move(commandArguments), - hasRange, false, false}); -} - -OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start) -{ - // Parse the commandName as a first identifier - Variant commandName = parseIdentifier(start, true); - if (commandName.asString().empty()) { - logger.error("Empty command name", reader); - return State::NONE; - } - - // Handle the special "begin" and "end" commands - const auto commandNameComponents = - Utils::split(commandName.asString(), ':'); - const bool isBegin = commandNameComponents[0] == "begin"; - const bool isEnd = commandNameComponents[0] == "end"; - if (isBegin || isEnd) { - if (commandNameComponents.size() > 1) { - logger.error( - "Special commands 
\"\\begin\" and \"\\end\" may not contain a " - "namespace separator \":\"", - commandName); - } - if (isBegin) { - return parseBeginCommand(); - } else if (isEnd) { - return parseEndCommand(); - } - } - - // Check whether the next character is a '#', indicating the start of the - // command name - Variant commandArgName; - start = reader.getOffset(); - if (reader.expect('#')) { - commandArgName = parseIdentifier(start); - if (commandArgName.asString().empty()) { - logger.error("Expected identifier after \"#\"", commandArgName); - } - } - - // Parse the arugments - Variant commandArguments = parseCommandArguments(std::move(commandArgName)); - - // Push the command onto the command stack - pushCommand(std::move(commandName), std::move(commandArguments), false); - - return State::COMMAND; -} - -void OsdmStreamParser::parseBlockComment() -{ - DynamicToken token; - size_t depth = 1; - while (tokenizer.read(reader, token)) { - if (token.type == Tokens.BlockCommentEnd) { - depth--; - if (depth == 0) { - return; - } - } - if (token.type == Tokens.BlockCommentStart) { - depth++; - } - } - - // Issue an error if the file ends while we are in a block comment - logger.error("File ended while being in a block comment", reader); -} - -void OsdmStreamParser::parseLineComment() -{ - char c; - while (reader.read(c)) { - if (c == '\n') { - return; - } - } -} - -bool OsdmStreamParser::checkIssueData(DataHandler &handler) -{ - if (!handler.isEmpty()) { - data = handler.toVariant(reader.getSourceId()); - location = data.getLocation(); - reader.resetPeek(); - return true; - } - return false; -} - -bool OsdmStreamParser::checkIssueFieldStart() -{ - // Fetch the current command, and check whether we're currently inside a - // field of this command - Command &cmd = commands.top(); - if (!cmd.inField) { - // If this is a range command, we're now implicitly inside the field of - // this command -- we'll have to issue a field start command! 
- if (cmd.hasRange) { - cmd.inField = true; - cmd.inRangeField = true; - reader.resetPeek(); - return true; - } - - // This was not a range command, so obviously we're now inside within - // a field of some command -- so unroll the commands stack until a - // command with open field is reached - while (!commands.top().inField) { - commands.pop(); - } - } - return false; -} - -OsdmStreamParser::State OsdmStreamParser::parse() -{ - // Handler for incomming data - DataHandler handler; - - // Read tokens until the outer loop should be left - DynamicToken token; - while (tokenizer.peek(reader, token)) { - const TokenTypeId type = token.type; - - // Special handling for Backslash and Text - if (type == Tokens.Backslash) { - // Before appending anything to the output data or starting a new - // command, check whether FIELD_START has to be issued, as the - // current command is a command with range - if (checkIssueFieldStart()) { - location = token.location; - return State::FIELD_START; - } - - // Check whether a command starts now, without advancing the peek - // cursor - char c; - if (!reader.fetchPeek(c)) { - logger.error("Trailing backslash at the end of the file.", - token); - return State::END; - } - - // Try to parse a command - if (Utils::isIdentifierStartCharacter(c)) { - // Make sure to issue any data before it is to late - if (checkIssueData(handler)) { - return State::DATA; - } - - // Parse the actual command - State res = parseCommand(token.location.getStart()); - switch (res) { - case State::ERROR: - throw LoggableException( - "Last error was irrecoverable, ending parsing " - "process"); - case State::NONE: - continue; - default: - return res; - } - } - - // This was not a special character, just append the given character - // to the data buffer, use the escape character start as start - // location and the peek offset as end location - reader.peek(c); // Peek the previously fetched character - handler.append(c, token.location.getStart(), - 
reader.getPeekOffset()); - reader.consumePeek(); - continue; - } else if (type == TextToken) { - // Check whether FIELD_START has to be issued before appending text - if (checkIssueFieldStart()) { - location = token.location; - return State::FIELD_START; - } - - // Append the text to the data handler - handler.append(token.content, token.location.getStart(), - token.location.getEnd()); - - reader.consumePeek(); - continue; - } - - // A non-text token was reached, make sure all pending data commands - // have been issued - if (checkIssueData(handler)) { - return State::DATA; - } - - // We will handle the token now, consume the peeked characters - reader.consumePeek(); - - // Update the location to the current token location - location = token.location; - - if (token.type == Tokens.LineComment) { - parseLineComment(); - } else if (token.type == Tokens.BlockCommentStart) { - parseBlockComment(); - } else if (token.type == Tokens.FieldStart) { - Command &cmd = commands.top(); - if (!cmd.inField) { - cmd.inField = true; - return State::FIELD_START; - } - logger.error( - "Got field start token \"{\", but no command for which to " - "start the field. Did you mean \"\\{\"?", - token); - } else if (token.type == Tokens.FieldEnd) { - // Try to end an open field of the current command -- if the current - // command is not inside an open field, end this command and try to - // close the next one - for (int i = 0; i < 2 && commands.size() > 1; i++) { - Command &cmd = commands.top(); - if (!cmd.inRangeField) { - if (cmd.inField) { - cmd.inField = false; - return State::FIELD_END; - } - commands.pop(); - } else { - break; - } - } - logger.error( - "Got field end token \"}\", but there is no field to end. 
Did " - "you mean \"\\}\"?", - token); - } else { - logger.error("Unexpected token \"" + token.content + "\"", token); - } - } - - // Issue available data - if (checkIssueData(handler)) { - return State::DATA; - } - - // Make sure all open commands and fields have been ended at the end of the - // stream - while (commands.size() > 1) { - Command &cmd = commands.top(); - if (cmd.inField || cmd.hasRange) { - logger.error("Reached end of stream, but command \"" + - cmd.name.asString() + "\" has not been ended", - cmd.name); - } - commands.pop(); - } - - location = SourceLocation{reader.getSourceId(), reader.getOffset()}; - return State::END; -} - -const Variant &OsdmStreamParser::getCommandName() -{ - return commands.top().name; -} - -const Variant &OsdmStreamParser::getCommandArguments() -{ - return commands.top().arguments; -} -} - diff --git a/src/formats/osdm/OsdmStreamParser.hpp b/src/formats/osdm/OsdmStreamParser.hpp deleted file mode 100644 index 48d8fb7..0000000 --- a/src/formats/osdm/OsdmStreamParser.hpp +++ /dev/null @@ -1,351 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file OsdmStreamParser.hpp - * - * Provides classes for low-level classes for reading the TeX-esque osdm - * format. The class provided here does not build any model objects and does not - * implement the Parser interface. 
- * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_OSDM_STREAM_PARSER_HPP_ -#define _OUSIA_OSDM_STREAM_PARSER_HPP_ - -#include - -#include - -#include "DynamicTokenizer.hpp" - -namespace ousia { - -// Forward declarations -class CharReader; -class Logger; -class DataHandler; - -/** - * The OsdmStreamParser class provides a low-level reader for the TeX-esque osdm - * format. The parser is constructed around a "parse" function, which reads data - * from the underlying CharReader until a new state is reached and indicates - * this state in a return value. The calling code then has to pull corresponding - * data from the stream reader. The reader makes sure the incommind file is - * syntactically valid and tries to recorver from most errors. If an error is - * irrecoverable (this is the case for errors with wrong nesting of commands or - * fields, as this would lead to too many consecutive errors) a - * LoggableException is thrown. - */ -class OsdmStreamParser { -public: - /** - * Enum used to indicate which state the OsdmStreamParser class is in - * after calling the "parse" function. - */ - enum class State { - /** - * State returned if a fully featured command has been read. A command - * consists of the command name and its arguments (which optionally - * includes the name). - */ - COMMAND, - - /** - * State returned if data is given. The reader must decide which field - * or command this should be routed to. Trailing or leading whitespace - * has been removed. Only called if the data is non-empty. - */ - DATA, - - /** - * A user-defined entity has been found. The entity sequence is stored - * in the command name. - */ - ENTITY, - - /** - * State returned if an annotation was started. An annotation consists - * of the command name and its arguments (which optionally include the - * name). - */ - ANNOTATION_START, - - /** - * State returned if an annotation ends. The reader indicates which - * annotation ends. 
- */ - ANNOTATION_END, - - /** - * State returned if a new field started. The reader assures that the - * current field ends before a new field is started and that the field - * is not started if data has been given outside of a field. The - * field number is set to the current field index. - */ - FIELD_START, - - /** - * State returned if the current field ends. The reader assures that a - * field was actually open. - */ - FIELD_END, - - /** - * The end of the stream has been reached. - */ - END, - - /** - * Returned from internal functions if nothing should be done. - */ - NONE, - - /** - * Returned from internal function to indicate irrecoverable errors. - */ - ERROR - }; - - /** - * Entry used for the command stack. - */ - struct Command { - /** - * Name and location of the current command. - */ - Variant name; - - /** - * Arguments that were passed to the command. - */ - Variant arguments; - - /** - * Set to true if this is a command with clear begin and end. - */ - bool hasRange; - - /** - * Set to true if we are currently inside a field of this command. - */ - bool inField; - - /** - * Set to true if we are currently in the range field of the command - * (implies inField being set to true). - */ - bool inRangeField; - - /** - * Default constructor. - */ - Command() : hasRange(false), inField(false), inRangeField(false) {} - - /** - * Constructor of the Command class. - * - * @param name is a string variant with name and location of the - * command. - * @param arguments is a map variant with the arguments given to the - * command. - * @param hasRange should be set to true if this is a command with - * explicit range. - * @param inField is set to true if we currently are inside a field - * of this command. - * @param inRangeField is set to true if we currently inside the outer - * field of the command. 
- */ - Command(Variant name, Variant arguments, bool hasRange, bool inField, - bool inRangeField) - : name(std::move(name)), - arguments(std::move(arguments)), - hasRange(hasRange), - inField(inField), - inRangeField(inRangeField) - { - } - }; - -private: - /** - * Reference to the CharReader instance from which the incomming bytes are - * read. - */ - CharReader &reader; - - /** - * Reference at the logger instance to which all error messages are sent. - */ - Logger &logger; - - /** - * Tokenizer instance used to read individual tokens from the text. - */ - DynamicTokenizer tokenizer; - - /** - * Stack containing the current commands. - */ - std::stack commands; - - /** - * Variant containing the data that has been read (always is a string, - * contains the exact location of the data in the source file). - */ - Variant data; - - /** - * Contains the location of the last token. - */ - SourceLocation location; - - /** - * Contains the field index of the current command. - */ - size_t fieldIdx; - - /** - * Function used internall to parse an identifier. - * - * @param start is the start byte offset of the identifier (including the - * backslash). - * @param allowNSSep should be set to true if the namespace separator is - * allowed in the identifier name. Issues error if the namespace separator - * is placed incorrectly. - */ - Variant parseIdentifier(size_t start, bool allowNSSep = false); - - /** - * Function used internally to handle the special "\begin" command. - */ - State parseBeginCommand(); - - /** - * Function used internally to handle the special "\end" command. - */ - State parseEndCommand(); - - /** - * Pushes the parsed command onto the command stack. - */ - void pushCommand(Variant commandName, Variant commandArguments, - bool hasRange); - - /** - * Parses the command arguments. - */ - Variant parseCommandArguments(Variant commandArgName); - - /** - * Function used internally to parse a command. 
- * - * @param start is the start byte offset of the command (including the - * backslash) - * @return true if a command was actuall parsed, false otherwise. - */ - State parseCommand(size_t start); - - /** - * Function used internally to parse a block comment. - */ - void parseBlockComment(); - - /** - * Function used internally to parse a generic comment. - */ - void parseLineComment(); - - /** - * Checks whether there is any data pending to be issued, if yes, issues it. - * - * @param handler is the data handler that contains the data that may be - * returned to the user. - * @return true if there was any data and DATA should be returned by the - * parse function, false otherwise. - */ - bool checkIssueData(DataHandler &handler); - - /** - * Called before any data is appended to the internal data handler. Checks - * whether a new field should be started or implicitly ended. - * - * @return true if FIELD_START should be returned by the parse function. - */ - bool checkIssueFieldStart(); - -public: - /** - * Constructor of the OsdmStreamParser class. Attaches the new - * OsdmStreamParser to the given CharReader and Logger instances. - * - * @param reader is the reader instance from which incomming characters - * should be read. - * @param logger is the logger instance to which errors should be written. - */ - OsdmStreamParser(CharReader &reader, Logger &logger); - - /** - * Continues parsing. Returns one of the states defined in the State enum. - * Callers should stop once the State::END state is reached. Use the getter - * functions to get more information about the current state, such as the - * command name or the data or the current field index. - * - * @return the new state the parser has reached. - */ - State parse(); - - /** - * Returns a reference at the internally stored data. Only valid if - * State::DATA was returned by the "parse" function. - * - * @return a reference at a variant containing the data parsed by the - * "parse" function. 
- */ - const Variant &getData() { return data; } - - /** - * Returns a reference at the internally stored command name. Only valid if - * State::COMMAND was returned by the "parse" function. - * - * @return a reference at a variant containing name and location of the - * parsed command. - */ - const Variant &getCommandName(); - - /** - * Returns a reference at the internally stored command name. Only valid if - * State::COMMAND was returned by the "parse" function. - * - * @return a reference at a variant containing arguments given to the - * command. - */ - const Variant &getCommandArguments(); - - /** - * Returns a reference at the char reader. - * - * @return the last internal token location. - */ - SourceLocation &getLocation() { return location; } -}; -} - -#endif /* _OUSIA_OSDM_STREAM_PARSER_HPP_ */ - diff --git a/src/formats/osdmx/OsdmxParser.cpp b/src/formats/osdmx/OsdmxParser.cpp deleted file mode 100644 index c46d9de..0000000 --- a/src/formats/osdmx/OsdmxParser.cpp +++ /dev/null @@ -1,1435 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "XmlParser.hpp" - -namespace ousia { - -/* HeadNode Helper class */ - -namespace { -class HeadNode : public Node { -public: - using Node::Node; -}; -} - -namespace RttiTypes { -static Rtti HeadNode = RttiBuilder("HeadNode"); -} - -/* Element Handler Classes */ - -class DocumentHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - Rooted document = - project()->createDocument(args["name"].asString()); - document->setLocation(location()); - scope().push(document); - scope().setFlag(ParserFlag::POST_HEAD, false); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new DocumentHandler{handlerData}; - } -}; - -class DocumentField : public Node { -public: - DocumentField(Manager &mgr, std::string name, Handle parent) - : Node(mgr, name, parent) - { - } -}; - -namespace RttiTypes { -const Rtti DocumentField = - RttiBuilder("DocumentField").parent(&Node); -} - -class DocumentChildHandler : public Handler { -public: - using Handler::Handler; - - void preamble(Handle parentNode, std::string &fieldName, - DocumentEntity *&parent, bool &inField) - { - // check if the parent in the structure tree was an explicit field - // reference. - inField = parentNode->isa(&RttiTypes::DocumentField); - if (inField) { - fieldName = parentNode->getName(); - parentNode = scope().selectOrThrow( - {&RttiTypes::StructuredEntity, &RttiTypes::AnnotationEntity}); - } else { - // if it wasn't an explicit reference, we use the default field. - fieldName = DEFAULT_FIELD_NAME; - } - // reference the parent entity explicitly. 
- parent = nullptr; - if (parentNode->isa(&RttiTypes::StructuredEntity)) { - parent = static_cast( - parentNode.cast().get()); - } else if (parentNode->isa(&RttiTypes::AnnotationEntity)) { - parent = static_cast( - parentNode.cast().get()); - } - } - - void start(Variant::mapType &args) override - { - scope().setFlag(ParserFlag::POST_HEAD, true); - Rooted parentNode = scope().selectOrThrow( - {&RttiTypes::Document, &RttiTypes::StructuredEntity, - &RttiTypes::AnnotationEntity, &RttiTypes::DocumentField}); - - std::string fieldName; - DocumentEntity *parent; - bool inField; - - preamble(parentNode, fieldName, parent, inField); - - // try to find a FieldDescriptor for the given tag if we are not in a - // field already. - // TODO: Consider fields of transparent classes - if (!inField && parent != nullptr && - parent->getDescriptor()->hasField(name())) { - Rooted field{new DocumentField( - parentNode->getManager(), fieldName, parentNode)}; - field->setLocation(location()); - scope().push(field); - return; - } - - // Otherwise create a new StructuredEntity - // TODO: Consider Anchors and AnnotationEntities - Rooted strct = scope().resolve( - Utils::split(name(), ':'), logger()); - if (strct == nullptr) { - // if we could not resolve the name, throw an exception. - throw LoggableException( - std::string("\"") + name() + "\" could not be resolved.", - location()); - } - - std::string name; - auto it = args.find("name"); - if (it != args.end()) { - name = it->second.asString(); - args.erase(it); - } - - Rooted entity; - if (parentNode->isa(&RttiTypes::Document)) { - entity = parentNode.cast()->createRootStructuredEntity( - strct, args, name); - } else { - // calculate a path if transparent entities are needed in between. 
- auto path = parent->getDescriptor()->pathTo(strct); - if (path.empty()) { - throw LoggableException( - std::string("An instance of \"") + strct->getName() + - "\" is not allowed as child of an instance of \"" + - parent->getDescriptor()->getName() + "\"", - location()); - } - - // create all transparent entities until the last field. - for (size_t p = 1; p < path.size() - 1; p = p + 2) { - parent = static_cast( - parent->createChildStructuredEntity( - path[p].cast(), - Variant::mapType{}, path[p - 1]->getName(), - "").get()); - } - entity = parent->createChildStructuredEntity(strct, args, fieldName, - name); - } - entity->setLocation(location()); - scope().push(entity); - } - - void end() override { scope().pop(); } - - void data(const std::string &data, int fieldIdx) override - { - Rooted parentNode = scope().selectOrThrow( - {&RttiTypes::StructuredEntity, &RttiTypes::AnnotationEntity, - &RttiTypes::DocumentField}); - - std::string fieldName; - DocumentEntity *parent; - bool inField; - - preamble(parentNode, fieldName, parent, inField); - - // retrieve the correct FieldDescriptor. - // TODO: Consider fields of transparent classes - Rooted desc = parent->getDescriptor(); - Rooted field = desc->getFieldDescriptor(fieldName); - if (field == nullptr) { - logger().error( - std::string("Can't handle data because no field with name \"") + - fieldName + "\" exists in descriptor\"" + desc->getName() + - "\".", - location()); - return; - } - if (!field->isPrimitive()) { - logger().error(std::string("Can't handle data because field \"") + - fieldName + "\" of descriptor \"" + - desc->getName() + "\" is not primitive!", - location()); - return; - } - - // try to parse the content. - auto res = VariantReader::parseGenericString( - data, logger(), location().getSourceId(), location().getStart()); - if (!res.first) { - return; - } - // try to convert it to the correct type. 
- if (!field->getPrimitiveType()->build(res.second, logger())) { - return; - } - // add it as primitive content. - parent->createChildDocumentPrimitive(res.second, fieldName); - } - - static Handler *create(const HandlerData &handlerData) - { - return new DocumentChildHandler{handlerData}; - } -}; - -class TypesystemHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - // Create the typesystem instance - Rooted typesystem = - project()->createTypesystem(args["name"].asString()); - typesystem->setLocation(location()); - - // Push the typesystem onto the scope, set the POST_HEAD flag to true - scope().push(typesystem); - scope().setFlag(ParserFlag::POST_HEAD, false); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new TypesystemHandler{handlerData}; - } -}; - -class TypesystemEnumHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - scope().setFlag(ParserFlag::POST_HEAD, true); - - // Fetch the current typesystem and create the enum node - Rooted typesystem = scope().selectOrThrow(); - Rooted enumType = - typesystem->createEnumType(args["name"].asString()); - enumType->setLocation(location()); - - scope().push(enumType); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new TypesystemEnumHandler{handlerData}; - } -}; - -class TypesystemEnumEntryHandler : public Handler { -public: - using Handler::Handler; - - std::string entry; - - void start(Variant::mapType &args) override {} - - void end() override - { - Rooted enumType = scope().selectOrThrow(); - enumType->addEntry(entry, logger()); - } - - void data(const std::string &data, int field) override - { - if (field != 0) { - // TODO: This should be stored in the HandlerData - logger().error("Enum entry only has one field."); - return; - } - 
entry.append(data); - } - - static Handler *create(const HandlerData &handlerData) - { - return new TypesystemEnumEntryHandler{handlerData}; - } -}; - -class TypesystemStructHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - scope().setFlag(ParserFlag::POST_HEAD, true); - - // Fetch the arguments used for creating this type - const std::string &name = args["name"].asString(); - const std::string &parent = args["parent"].asString(); - - // Fetch the current typesystem and create the struct node - Rooted typesystem = scope().selectOrThrow(); - Rooted structType = typesystem->createStructType(name); - structType->setLocation(location()); - - // Try to resolve the parent type and set it as parent structure - if (!parent.empty()) { - scope().resolve( - parent, structType, logger(), - [](Handle parent, Handle structType, - Logger &logger) { - if (parent != nullptr) { - structType.cast()->setParentStructure( - parent.cast(), logger); - } - }); - } - scope().push(structType); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new TypesystemStructHandler{handlerData}; - } -}; - -class TypesystemStructFieldHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - // Read the argument values - const std::string &name = args["name"].asString(); - const std::string &type = args["type"].asString(); - const Variant &defaultValue = args["default"]; - const bool optional = - !(defaultValue.isObject() && defaultValue.asObject() == nullptr); - - Rooted structType = scope().selectOrThrow(); - Rooted attribute = - structType->createAttribute(name, defaultValue, optional, logger()); - attribute->setLocation(location()); - - // Try to resolve the type and default value - if (optional) { - scope().resolveTypeWithValue( - type, attribute, attribute->getDefaultValue(), logger(), - [](Handle type, Handle 
attribute, Logger &logger) { - if (type != nullptr) { - attribute.cast()->setType(type.cast(), - logger); - } - }); - } else { - scope().resolveType( - type, attribute, logger(), - [](Handle type, Handle attribute, Logger &logger) { - if (type != nullptr) { - attribute.cast()->setType(type.cast(), - logger); - } - }); - } - } - - void end() override {} - - static Handler *create(const HandlerData &handlerData) - { - return new TypesystemStructFieldHandler{handlerData}; - } -}; - -class TypesystemConstantHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - scope().setFlag(ParserFlag::POST_HEAD, true); - - // Read the argument values - const std::string &name = args["name"].asString(); - const std::string &type = args["type"].asString(); - const Variant &value = args["value"]; - - Rooted typesystem = scope().selectOrThrow(); - Rooted constant = typesystem->createConstant(name, value); - constant->setLocation(location()); - - // Try to resolve the type - scope().resolveTypeWithValue( - type, constant, constant->getValue(), logger(), - [](Handle type, Handle constant, Logger &logger) { - if (type != nullptr) { - constant.cast()->setType(type.cast(), - logger); - } - }); - } - - void end() override {} - - static Handler *create(const HandlerData &handlerData) - { - return new TypesystemConstantHandler{handlerData}; - } -}; - -/* - * Domain Handlers - */ - -class DomainHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - Rooted domain = - project()->createDomain(args["name"].asString()); - domain->setLocation(location()); - - scope().push(domain); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new DomainHandler{handlerData}; - } -}; - -class DomainStructHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - 
scope().setFlag(ParserFlag::POST_HEAD, true); - - Rooted domain = scope().selectOrThrow(); - - Rooted structuredClass = domain->createStructuredClass( - args["name"].asString(), args["cardinality"].asCardinality(), - nullptr, args["transparent"].asBool(), args["isRoot"].asBool()); - structuredClass->setLocation(location()); - - const std::string &isa = args["isa"].asString(); - if (!isa.empty()) { - scope().resolve( - isa, structuredClass, logger(), - [](Handle superclass, Handle structuredClass, - Logger &logger) { - if (superclass != nullptr) { - structuredClass.cast()->setSuperclass( - superclass.cast(), logger); - } - }); - } - - scope().push(structuredClass); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new DomainStructHandler{handlerData}; - } -}; - -class DomainAnnotationHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - scope().setFlag(ParserFlag::POST_HEAD, true); - - Rooted domain = scope().selectOrThrow(); - - Rooted annotationClass = - domain->createAnnotationClass(args["name"].asString()); - annotationClass->setLocation(location()); - - scope().push(annotationClass); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new DomainAnnotationHandler{handlerData}; - } -}; - -class DomainAttributesHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - // Fetch the current typesystem and create the struct node - Rooted parent = scope().selectOrThrow(); - - Rooted attrDesc = parent->getAttributesDescriptor(); - attrDesc->setLocation(location()); - - scope().push(attrDesc); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new DomainAttributesHandler{handlerData}; - } -}; - -class DomainFieldHandler : public Handler { -public: - 
using Handler::Handler; - - void start(Variant::mapType &args) override - { - FieldDescriptor::FieldType type; - if (args["isSubtree"].asBool()) { - type = FieldDescriptor::FieldType::SUBTREE; - } else { - type = FieldDescriptor::FieldType::TREE; - } - - Rooted parent = scope().selectOrThrow(); - - Rooted field = parent->createFieldDescriptor( - type, args["name"].asString(), args["optional"].asBool()); - field->setLocation(location()); - - scope().push(field); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new DomainFieldHandler{handlerData}; - } -}; - -class DomainFieldRefHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - Rooted parent = scope().selectOrThrow(); - - const std::string &name = args["name"].asString(); - scope().resolve( - name, parent, logger(), - [](Handle field, Handle parent, Logger &logger) { - if (field != nullptr) { - parent.cast()->addFieldDescriptor( - field.cast()); - } - }); - } - - void end() override {} - - static Handler *create(const HandlerData &handlerData) - { - return new DomainFieldRefHandler{handlerData}; - } -}; - -class DomainPrimitiveHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - Rooted parent = scope().selectOrThrow(); - - Rooted field = parent->createPrimitiveFieldDescriptor( - nullptr, args["name"].asString(), args["optional"].asBool()); - field->setLocation(location()); - - const std::string &type = args["type"].asString(); - scope().resolve( - type, field, logger(), - [](Handle type, Handle field, Logger &logger) { - if (type != nullptr) { - field.cast()->setPrimitiveType( - type.cast()); - } - }); - - scope().push(field); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new DomainPrimitiveHandler{handlerData}; - } -}; - -class DomainChildHandler : 
public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - Rooted field = - scope().selectOrThrow(); - - const std::string &ref = args["ref"].asString(); - scope().resolve( - ref, field, logger(), - [](Handle child, Handle field, Logger &logger) { - if (child != nullptr) { - field.cast()->addChild( - child.cast()); - } - }); - } - - void end() override {} - - static Handler *create(const HandlerData &handlerData) - { - return new DomainChildHandler{handlerData}; - } -}; - -class DomainParent : public Node { -public: - DomainParent(Manager &mgr, std::string name, Handle parent) - : Node(mgr, name, parent) - { - } -}; - -namespace RttiTypes { -const Rtti DomainParent = - RttiBuilder("DomainParent").parent(&Node); -} - -class DomainParentHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - Rooted strct = - scope().selectOrThrow(); - - Rooted parent{new DomainParent( - strct->getManager(), args["name"].asString(), strct)}; - parent->setLocation(location()); - scope().push(parent); - } - - void end() override { scope().pop(); } - - static Handler *create(const HandlerData &handlerData) - { - return new DomainParentHandler{handlerData}; - } -}; - -class DomainParentFieldHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - Rooted parentNameNode = - scope().selectOrThrow(); - FieldDescriptor::FieldType type; - if (args["isSubtree"].asBool()) { - type = FieldDescriptor::FieldType::SUBTREE; - } else { - type = FieldDescriptor::FieldType::TREE; - } - - const std::string &name = args["name"].asString(); - const bool optional = args["optional"].asBool(); - Rooted strct = - parentNameNode->getParent().cast(); - - // resolve the parent, create the declared field and add the declared - // StructuredClass as child to it. 
- scope().resolve( - parentNameNode->getName(), strct, logger(), - [type, name, optional](Handle parent, Handle strct, - Logger &logger) { - if (parent != nullptr) { - Rooted field = - parent.cast()->createFieldDescriptor( - type, name, optional); - field->addChild(strct.cast()); - } - }); - } - - void end() override {} - - static Handler *create(const HandlerData &handlerData) - { - return new DomainParentFieldHandler{handlerData}; - } -}; - -class DomainParentFieldRefHandler : public Handler { -public: - using Handler::Handler; - - void start(Variant::mapType &args) override - { - Rooted parentNameNode = - scope().selectOrThrow(); - - const std::string &name = args["name"].asString(); - Rooted strct = - parentNameNode->getParent().cast(); - auto loc = location(); - - // resolve the parent, get the referenced field and add the declared - // StructuredClass as child to it. - scope().resolve(parentNameNode->getName(), strct, logger(), - [name, loc](Handle parent, - Handle strct, - Logger &logger) { - if (parent != nullptr) { - auto res = parent.cast()->resolve( - &RttiTypes::FieldDescriptor, name); - if (res.size() != 1) { - logger.error( - std::string("Could not find referenced field ") + name, - loc); - return; - } - Rooted field = - res[0].node.cast(); - field->addChild(strct.cast()); - } - }); - } - - void end() override {} - - static Handler *create(const HandlerData &handlerData) - { - return new DomainParentFieldRefHandler{handlerData}; - } -}; - -/* - * Import and Include Handler - */ - -class ImportIncludeHandler : public Handler { -public: - using Handler::Handler; - - bool srcInArgs = false; - std::string rel; - std::string type; - std::string src; - - void start(Variant::mapType &args) override - { - rel = args["rel"].asString(); - type = args["type"].asString(); - src = args["src"].asString(); - srcInArgs = !src.empty(); - } - - void data(const std::string &data, int field) override - { - if (srcInArgs) { - logger().error("\"src\" attribute has already 
been set"); - return; - } - if (field != 0) { - logger().error("Command has only one field."); - return; - } - src.append(data); - } -}; - -class ImportHandler : public ImportIncludeHandler { -public: - using ImportIncludeHandler::ImportIncludeHandler; - - void start(Variant::mapType &args) override - { - ImportIncludeHandler::start(args); - - // Make sure imports are still possible - if (scope().getFlag(ParserFlag::POST_HEAD)) { - logger().error("Imports must be listed before other commands.", - location()); - return; - } - } - - void end() override - { - // Fetch the last node and check whether an import is valid at this - // position - Rooted leaf = scope().getLeaf(); - if (leaf == nullptr || !leaf->isa(&RttiTypes::RootNode)) { - logger().error( - "Import not supported here, must be inside a document, domain " - "or typesystem command.", - location()); - return; - } - Rooted leafRootNode = leaf.cast(); - - // Perform the actual import, register the imported node within the leaf - // node - Rooted imported = - context().import(src, type, rel, leafRootNode->getReferenceTypes()); - if (imported != nullptr) { - leafRootNode->reference(imported); - } - } - - static Handler *create(const HandlerData &handlerData) - { - return new ImportHandler{handlerData}; - } -}; - -class IncludeHandler : public ImportIncludeHandler { -public: - using ImportIncludeHandler::ImportIncludeHandler; - - void start(Variant::mapType &args) override - { - ImportIncludeHandler::start(args); - } - - void end() override - { - context().include(src, type, rel, {&RttiTypes::Node}); - } - - static Handler *create(const HandlerData &handlerData) - { - return new IncludeHandler{handlerData}; - } -}; - -namespace ParserStates { -/* Document states */ -static const ParserState Document = - ParserStateBuilder() - .parent(&None) - .createdNodeType(&RttiTypes::Document) - .elementHandler(DocumentHandler::create) - .arguments({Argument::String("name", "")}); - -static const ParserState DocumentChild = - 
ParserStateBuilder() - .parents({&Document, &DocumentChild}) - .createdNodeTypes({&RttiTypes::StructureNode, - &RttiTypes::AnnotationEntity, - &RttiTypes::DocumentField}) - .elementHandler(DocumentChildHandler::create); - -/* Domain states */ -static const ParserState Domain = ParserStateBuilder() - .parents({&None, &Document}) - .createdNodeType(&RttiTypes::Domain) - .elementHandler(DomainHandler::create) - .arguments({Argument::String("name")}); - -static const ParserState DomainStruct = - ParserStateBuilder() - .parent(&Domain) - .createdNodeType(&RttiTypes::StructuredClass) - .elementHandler(DomainStructHandler::create) - .arguments({Argument::String("name"), - Argument::Cardinality("cardinality", Cardinality::any()), - Argument::Bool("isRoot", false), - Argument::Bool("transparent", false), - Argument::String("isa", "")}); - -static const ParserState DomainAnnotation = - ParserStateBuilder() - .parent(&Domain) - .createdNodeType(&RttiTypes::AnnotationClass) - .elementHandler(DomainAnnotationHandler::create) - .arguments({Argument::String("name")}); - -static const ParserState DomainAttributes = - ParserStateBuilder() - .parents({&DomainStruct, &DomainAnnotation}) - .createdNodeType(&RttiTypes::StructType) - .elementHandler(DomainAttributesHandler::create) - .arguments({}); - -static const ParserState DomainAttribute = - ParserStateBuilder() - .parent(&DomainAttributes) - .elementHandler(TypesystemStructFieldHandler::create) - .arguments({Argument::String("name"), Argument::String("type"), - Argument::Any("default", Variant::fromObject(nullptr))}); - -static const ParserState DomainField = - ParserStateBuilder() - .parents({&DomainStruct, &DomainAnnotation}) - .createdNodeType(&RttiTypes::FieldDescriptor) - .elementHandler(DomainFieldHandler::create) - .arguments({Argument::String("name", DEFAULT_FIELD_NAME), - Argument::Bool("isSubtree", false), - Argument::Bool("optional", false)}); - -static const ParserState DomainFieldRef = - ParserStateBuilder() - 
.parents({&DomainStruct, &DomainAnnotation}) - .createdNodeType(&RttiTypes::FieldDescriptor) - .elementHandler(DomainFieldRefHandler::create) - .arguments({Argument::String("name", DEFAULT_FIELD_NAME)}); - -static const ParserState DomainStructPrimitive = - ParserStateBuilder() - .parents({&DomainStruct, &DomainAnnotation}) - .createdNodeType(&RttiTypes::FieldDescriptor) - .elementHandler(DomainPrimitiveHandler::create) - .arguments({Argument::String("name", DEFAULT_FIELD_NAME), - Argument::Bool("optional", false), - Argument::String("type")}); - -static const ParserState DomainStructChild = - ParserStateBuilder() - .parent(&DomainField) - .elementHandler(DomainChildHandler::create) - .arguments({Argument::String("ref")}); - -static const ParserState DomainStructParent = - ParserStateBuilder() - .parent(&DomainStruct) - .createdNodeType(&RttiTypes::DomainParent) - .elementHandler(DomainParentHandler::create) - .arguments({Argument::String("name")}); - -static const ParserState DomainStructParentField = - ParserStateBuilder() - .parent(&DomainStructParent) - .createdNodeType(&RttiTypes::FieldDescriptor) - .elementHandler(DomainParentFieldHandler::create) - .arguments({Argument::String("name", DEFAULT_FIELD_NAME), - Argument::Bool("isSubtree", false), - Argument::Bool("optional", false)}); - -static const ParserState DomainStructParentFieldRef = - ParserStateBuilder() - .parent(&DomainStructParent) - .createdNodeType(&RttiTypes::FieldDescriptor) - .elementHandler(DomainParentFieldRefHandler::create) - .arguments({Argument::String("name", DEFAULT_FIELD_NAME)}); - -/* Typesystem states */ -static const ParserState Typesystem = - ParserStateBuilder() - .parents({&None, &Domain}) - .createdNodeType(&RttiTypes::Typesystem) - .elementHandler(TypesystemHandler::create) - .arguments({Argument::String("name", "")}); - -static const ParserState TypesystemEnum = - ParserStateBuilder() - .parent(&Typesystem) - .createdNodeType(&RttiTypes::EnumType) - 
.elementHandler(TypesystemEnumHandler::create) - .arguments({Argument::String("name")}); - -static const ParserState TypesystemEnumEntry = - ParserStateBuilder() - .parent(&TypesystemEnum) - .elementHandler(TypesystemEnumEntryHandler::create) - .arguments({}); - -static const ParserState TypesystemStruct = - ParserStateBuilder() - .parent(&Typesystem) - .createdNodeType(&RttiTypes::StructType) - .elementHandler(TypesystemStructHandler::create) - .arguments({Argument::String("name"), Argument::String("parent", "")}); - -static const ParserState TypesystemStructField = - ParserStateBuilder() - .parent(&TypesystemStruct) - .elementHandler(TypesystemStructFieldHandler::create) - .arguments({Argument::String("name"), Argument::String("type"), - Argument::Any("default", Variant::fromObject(nullptr))}); - -static const ParserState TypesystemConstant = - ParserStateBuilder() - .parent(&Typesystem) - .createdNodeType(&RttiTypes::Constant) - .elementHandler(TypesystemConstantHandler::create) - .arguments({Argument::String("name"), Argument::String("type"), - Argument::Any("value")}); - -/* Special states for import and include */ -static const ParserState Import = - ParserStateBuilder() - .parents({&Document, &Typesystem, &Domain}) - .elementHandler(ImportHandler::create) - .arguments({Argument::String("rel", ""), Argument::String("type", ""), - Argument::String("src", "")}); - -static const ParserState Include = - ParserStateBuilder() - .parent(&All) - .elementHandler(IncludeHandler::create) - .arguments({Argument::String("rel", ""), Argument::String("type", ""), - Argument::String("src", "")}); - -static const std::multimap XmlStates{ - {"document", &Document}, - {"*", &DocumentChild}, - {"domain", &Domain}, - {"struct", &DomainStruct}, - {"annotation", &DomainAnnotation}, - {"attributes", &DomainAttributes}, - {"attribute", &DomainAttribute}, - {"field", &DomainField}, - {"fieldRef", &DomainFieldRef}, - {"primitive", &DomainStructPrimitive}, - {"child", 
&DomainStructChild}, - {"parent", &DomainStructParent}, - {"field", &DomainStructParentField}, - {"fieldRef", &DomainStructParentFieldRef}, - {"typesystem", &Typesystem}, - {"enum", &TypesystemEnum}, - {"entry", &TypesystemEnumEntry}, - {"struct", &TypesystemStruct}, - {"field", &TypesystemStructField}, - {"constant", &TypesystemConstant}, - {"import", &Import}, - {"include", &Include}}; -} - -/** - * Structue containing the private data that is being passed to the - * XML-Handlers. - */ -struct XMLUserData { - /** - * Containing the depth of the current XML file - */ - size_t depth; - - /** - * Reference at the ParserStack instance. - */ - ParserStack *stack; - - /** - * Reference at the CharReader instance. - */ - CharReader *reader; - - /** - * Constructor of the XMLUserData struct. - * - * @param stack is a pointer at the ParserStack instance. - * @param reader is a pointer at the CharReader instance. - */ - XMLUserData(ParserStack *stack, CharReader *reader) - : depth(0), stack(stack), reader(reader) - { - } -}; - -/** - * Wrapper class around the XML_Parser pointer which safely frees it whenever - * the scope is left (e.g. because an exception was thrown). - */ -class ScopedExpatXmlParser { -private: - /** - * Internal pointer to the XML_Parser instance. - */ - XML_Parser parser; - -public: - /** - * Constructor of the ScopedExpatXmlParser class. Calls XML_ParserCreateNS - * from the expat library. Throws a parser exception if the XML parser - * cannot be initialized. - * - * @param encoding is the protocol-defined encoding passed to expat (or - * nullptr if expat should determine the encoding by itself). - */ - ScopedExpatXmlParser(const XML_Char *encoding) : parser(nullptr) - { - parser = XML_ParserCreate(encoding); - if (!parser) { - throw LoggableException{ - "Internal error: Could not create expat XML parser!"}; - } - } - - /** - * Destuctor of the ScopedExpatXmlParser, frees the XML parser instance. 
- */ - ~ScopedExpatXmlParser() - { - if (parser) { - XML_ParserFree(parser); - parser = nullptr; - } - } - - /** - * Returns the XML_Parser pointer. - */ - XML_Parser operator&() { return parser; } -}; - -/* Adapter Expat -> ParserStack */ - -static SourceLocation syncLoggerPosition(XML_Parser p, size_t len = 0) -{ - // Fetch the parser stack and the associated user data - XMLUserData *userData = static_cast(XML_GetUserData(p)); - ParserStack *stack = userData->stack; - - // Fetch the current location in the XML file - size_t offs = XML_GetCurrentByteIndex(p); - - // Build the source location and update the default location of the - // current - // logger instance - SourceLocation loc{stack->getContext().getSourceId(), offs, offs + len}; - stack->getContext().getLogger().setDefaultLocation(loc); - return loc; -} - -enum class XMLAttributeState { - IN_TAG_NAME, - SEARCH_ATTR, - IN_ATTR_NAME, - HAS_ATTR_NAME, - HAS_ATTR_EQUALS, - IN_ATTR_DATA -}; - -static std::map reconstructXMLAttributeOffsets( - CharReader &reader, SourceLocation location) -{ - std::map res; - - // Fork the reader, we don't want to mess up the XML parsing process, do we? - CharReaderFork readerFork = reader.fork(); - - // Move the read cursor to the start location, abort if this does not work - size_t offs = location.getStart(); - if (!location.isValid() || offs != readerFork.seek(offs)) { - return res; - } - - // Now all we need to do is to implement one half of an XML parser. As this - // is inherently complicated we'll totaly fail at it. Don't care. All we - // want to get is those darn offsets for pretty error messages... 
(and we - // can assume the XML is valid as it was already read by expat) - XMLAttributeState state = XMLAttributeState::IN_TAG_NAME; - char c; - std::stringstream attrName; - while (readerFork.read(c)) { - // Abort at the end of the tag - if (c == '>' && state != XMLAttributeState::IN_ATTR_DATA) { - return res; - } - - // One state machine to rule them all, one state machine to find them, - // One state machine to bring them all and in the darkness bind them - // (the byte offsets) - switch (state) { - case XMLAttributeState::IN_TAG_NAME: - if (Utils::isWhitespace(c)) { - state = XMLAttributeState::SEARCH_ATTR; - } - break; - case XMLAttributeState::SEARCH_ATTR: - if (!Utils::isWhitespace(c)) { - state = XMLAttributeState::IN_ATTR_NAME; - attrName << c; - } - break; - case XMLAttributeState::IN_ATTR_NAME: - if (Utils::isWhitespace(c)) { - state = XMLAttributeState::HAS_ATTR_NAME; - } else if (c == '=') { - state = XMLAttributeState::HAS_ATTR_EQUALS; - } else { - attrName << c; - } - break; - case XMLAttributeState::HAS_ATTR_NAME: - if (!Utils::isWhitespace(c)) { - if (c == '=') { - state = XMLAttributeState::HAS_ATTR_EQUALS; - break; - } - // Well, this is a strange XML file... We expected to - // see a '=' here! Try to continue with the - // "HAS_ATTR_EQUALS" state as this state will hopefully - // inlcude some error recovery - } else { - // Skip whitespace here - break; - } - // Fallthrough - case XMLAttributeState::HAS_ATTR_EQUALS: - if (!Utils::isWhitespace(c)) { - if (c == '"') { - // Here we are! We have found the beginning of an - // attribute. Let's quickly lock the current offset away - // in the result map - res.emplace(attrName.str(), - SourceLocation{reader.getSourceId(), - readerFork.getOffset()}); - attrName.str(std::string{}); - state = XMLAttributeState::IN_ATTR_DATA; - } else { - // No, this XML file is not well formed. 
Assume we're in - // an attribute name once again - attrName.str(std::string{&c, 1}); - state = XMLAttributeState::IN_ATTR_NAME; - } - } - break; - case XMLAttributeState::IN_ATTR_DATA: - if (c == '"') { - // We're at the end of the attribute data, start anew - state = XMLAttributeState::SEARCH_ATTR; - } - break; - } - } - return res; -} - -static void xmlStartElementHandler(void *p, const XML_Char *name, - const XML_Char **attrs) -{ - XML_Parser parser = static_cast(p); - XMLUserData *userData = static_cast(XML_GetUserData(p)); - ParserStack *stack = userData->stack; - - SourceLocation loc = syncLoggerPosition(parser); - - // Read the argument locations -- this is only a stupid and slow hack, - // but it is necessary, as expat doesn't give use the byte offset of the - // arguments. - std::map offs = - reconstructXMLAttributeOffsets(*userData->reader, loc); - - // Assemble the arguments - Variant::mapType args; - - const XML_Char **attr = attrs; - while (*attr) { - // Convert the C string to a std::string - const std::string key{*(attr++)}; - - // Search the location of the key - SourceLocation keyLoc; - auto it = offs.find(key); - if (it != offs.end()) { - keyLoc = it->second; - } - - // Parse the string, pass the location of the key - std::pair value = VariantReader::parseGenericString( - *(attr++), stack->getContext().getLogger(), keyLoc.getSourceId(), - keyLoc.getStart()); - args.emplace(key, value.second); - } - - // Call the start function - std::string nameStr(name); - if (nameStr != "ousia" || userData->depth > 0) { - stack->start(std::string(name), args, loc); - } - - // Increment the current depth - userData->depth++; -} - -static void xmlEndElementHandler(void *p, const XML_Char *name) -{ - XML_Parser parser = static_cast(p); - XMLUserData *userData = static_cast(XML_GetUserData(p)); - ParserStack *stack = userData->stack; - - syncLoggerPosition(parser); - - // Decrement the current depth - userData->depth--; - - // Call the end function - std::string 
nameStr(name); - if (nameStr != "ousia" || userData->depth > 0) { - stack->end(); - } -} - -static void xmlCharacterDataHandler(void *p, const XML_Char *s, int len) -{ - XML_Parser parser = static_cast(p); - XMLUserData *userData = static_cast(XML_GetUserData(p)); - ParserStack *stack = userData->stack; - - size_t ulen = len > 0 ? static_cast(len) : 0; - syncLoggerPosition(parser, ulen); - const std::string data = Utils::trim(std::string{s, ulen}); - if (!data.empty()) { - stack->data(data); - } -} - -/* Class XmlParser */ - -void XmlParser::doParse(CharReader &reader, ParserContext &ctx) -{ - // Create the parser object - ScopedExpatXmlParser p{"UTF-8"}; - - // Create the parser stack instance, if we're starting on a non-empty scope, - // try to deduce the parser state - ParserStack stack(ctx, ParserStates::XmlStates); - if (!ctx.getScope().isEmpty()) { - if (!stack.deduceState()) { - return; - } - } - - // Pass the reference to the ParserStack to the XML handler - XMLUserData data(&stack, &reader); - XML_SetUserData(&p, &data); - XML_UseParserAsHandlerArg(&p); - - // Set the callback functions - XML_SetStartElementHandler(&p, xmlStartElementHandler); - XML_SetEndElementHandler(&p, xmlEndElementHandler); - XML_SetCharacterDataHandler(&p, xmlCharacterDataHandler); - - // Feed data into expat while there is data to process - constexpr size_t BUFFER_SIZE = 64 * 1024; - while (true) { - // Fetch a buffer from expat for the input data - char *buf = static_cast(XML_GetBuffer(&p, BUFFER_SIZE)); - if (!buf) { - throw LoggableException{ - "Internal error: XML parser out of memory!"}; - } - - // Read into the buffer - size_t bytesRead = reader.readRaw(buf, BUFFER_SIZE); - - // Parse the data and handle any XML error - if (!XML_ParseBuffer(&p, bytesRead, bytesRead == 0)) { - // Fetch the xml parser byte offset - size_t offs = XML_GetCurrentByteIndex(&p); - - // Throw a corresponding exception - XML_Error code = XML_GetErrorCode(&p); - std::string msg = 
std::string{XML_ErrorString(code)}; - throw LoggableException{"XML: " + msg, - SourceLocation{ctx.getSourceId(), offs}}; - } - - // Abort once there are no more bytes in the stream - if (bytesRead == 0) { - break; - } - } -} -} - diff --git a/src/formats/osdmx/OsdmxParser.hpp b/src/formats/osdmx/OsdmxParser.hpp deleted file mode 100644 index c8b6302..0000000 --- a/src/formats/osdmx/OsdmxParser.hpp +++ /dev/null @@ -1,55 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -/** - * @file XmlParser.hpp - * - * Contains the parser responsible for reading Ousía XML Documents (extension - * oxd) and Ousía XML Modules (extension oxm). - * - * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) - */ - -#ifndef _OUSIA_XML_PARSER_HPP_ -#define _OUSIA_XML_PARSER_HPP_ - -#include - -namespace ousia { - -/** - * The XmlParser class implements parsing the various types of Ousía XML - * documents using the expat stream XML parser. - */ -class XmlParser : public Parser { -protected: - /** - * Parses the given input stream as XML file and returns the parsed - * top-level node. - * - * @param reader is the CharReader from which the input should be read. - * @param ctx is a reference to the ParserContext instance that should be - * used. 
- */ - void doParse(CharReader &reader, ParserContext &ctx) override; -}; - -} - -#endif /* _OUSIA_XML_PARSER_HPP_ */ - diff --git a/src/formats/osml/OsmlParser.cpp b/src/formats/osml/OsmlParser.cpp new file mode 100644 index 0000000..4973639 --- /dev/null +++ b/src/formats/osml/OsmlParser.cpp @@ -0,0 +1,57 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include +#include + +#include "OsdmParser.hpp" +#include "OsdmStreamParser.hpp" + +namespace ousia { + +namespace { + +/** + * The OsdmParserImplementation class contains the actual implementation of the + * parsing process and is created in the "doParse" function of the OsdmParser. + + */ +class OsdmParserImplementation : public ParserStateCallbacks { +private: + /** + * OsdmStreamParser instance. + */ + OsdmStreamParser parser; + + /** + * Instance of the ParserStateStack. 
+ */ + ParserStateStack stack; + +public: + OsdmParserImplementation parser(reader, ctx) : parser(reader), stack(ctx, std::multimap) +}; +} + +void OsdmParser::doParse(CharReader &reader, ParserContext &ctx) +{ + OsdmParserImplementation parser(reader, ctx); + parser.parse(); +} + +} diff --git a/src/formats/osml/OsmlParser.hpp b/src/formats/osml/OsmlParser.hpp new file mode 100644 index 0000000..37505b4 --- /dev/null +++ b/src/formats/osml/OsmlParser.hpp @@ -0,0 +1,48 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file OsdmParser.hpp + * + * Contains the parser of the osdm format, the standard plain-text format used + * by Ousía for documents. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_OSDM_PARSER_HPP_ +#define _OUSIA_OSDM_PARSER_HPP_ + +#include + +namespace ousia { + +/** + * OsdmParser is a small wrapper implementing the Parser interface. The actual + * parsing is performed with the OsdmStreamParser in conjunction with the + * ParserStateStack. 
+ */ +class OsdmParser : public Parser { +protected: + void doParse(CharReader &reader, ParserContext &ctx) override; +}; + +} + +#endif /* _OUSIA_OSDM_PARSER_HPP_ */ + diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp new file mode 100644 index 0000000..6a55f12 --- /dev/null +++ b/src/formats/osml/OsmlStreamParser.cpp @@ -0,0 +1,645 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include +#include +#include +#include + +#include "OsdmStreamParser.hpp" + +namespace ousia { + +/** + * Plain format default tokenizer. + */ +class PlainFormatTokens : public Tokenizer { +public: + /** + * Id of the backslash token. + */ + TokenTypeId Backslash; + + /** + * Id of the line comment token. + */ + TokenTypeId LineComment; + + /** + * Id of the block comment start token. + */ + TokenTypeId BlockCommentStart; + + /** + * Id of the block comment end token. + */ + TokenTypeId BlockCommentEnd; + + /** + * Id of the field start token. + */ + TokenTypeId FieldStart; + + /** + * Id of the field end token. + */ + TokenTypeId FieldEnd; + + /** + * Registers the plain format tokens in the internal tokenizer. 
+ */ + PlainFormatTokens() + { + Backslash = registerToken("\\"); + LineComment = registerToken("%"); + BlockCommentStart = registerToken("%{"); + BlockCommentEnd = registerToken("}%"); + FieldStart = registerToken("{"); + FieldEnd = registerToken("}"); + } +}; + +static const PlainFormatTokens Tokens; + +/** + * Class used internally to collect data issued via "DATA" event. + */ +class DataHandler { +private: + /** + * Internal character buffer. + */ + std::vector buf; + + /** + * Start location of the character data. + */ + SourceOffset start; + + /** + * End location of the character data. + */ + SourceOffset end; + +public: + /** + * Default constructor, initializes start and end with zeros. + */ + DataHandler() : start(0), end(0) {} + + /** + * Returns true if the internal buffer is empty. + * + * @return true if no characters were added to the internal buffer, false + * otherwise. + */ + bool isEmpty() { return buf.empty(); } + + /** + * Appends a single character to the internal buffer. + * + * @param c is the character that should be added to the internal buffer. + * @param charStart is the start position of the character. + * @param charEnd is the end position of the character. + */ + void append(char c, SourceOffset charStart, SourceOffset charEnd) + { + if (isEmpty()) { + start = charStart; + } + buf.push_back(c); + end = charEnd; + } + + /** + * Appends a string to the internal buffer. + * + * @param s is the string that should be added to the internal buffer. + * @param stringStart is the start position of the string. + * @param stringEnd is the end position of the string. + */ + void append(const std::string &s, SourceOffset stringStart, + SourceOffset stringEnd) + { + if (isEmpty()) { + start = stringStart; + } + std::copy(s.c_str(), s.c_str() + s.size(), back_inserter(buf)); + end = stringEnd; + } + + /** + * Converts the internal buffer to a variant with attached location + * information. 
+ * + * @param sourceId is the source id which is needed for building the + * location information. + * @return a Variant with the internal buffer content as string and + * the correct start and end location. + */ + Variant toVariant(SourceId sourceId) + { + Variant res = Variant::fromString(std::string(buf.data(), buf.size())); + res.setLocation({sourceId, start, end}); + return res; + } +}; + +OsdmStreamParser::OsdmStreamParser(CharReader &reader, Logger &logger) + : reader(reader), logger(logger), tokenizer(Tokens) +{ + // Place an initial command representing the complete file on the stack + commands.push(Command{"", Variant::mapType{}, true, true, true}); +} + +Variant OsdmStreamParser::parseIdentifier(size_t start, bool allowNSSep) +{ + bool first = true; + bool hasCharSiceNSSep = false; + std::vector identifier; + size_t end = reader.getPeekOffset(); + char c, c2; + while (reader.peek(c)) { + // Abort if this character is not a valid identifier character + if ((first && Utils::isIdentifierStartCharacter(c)) || + (!first && Utils::isIdentifierCharacter(c))) { + identifier.push_back(c); + } else if (c == ':' && hasCharSiceNSSep && reader.fetchPeek(c2) && + Utils::isIdentifierStartCharacter(c2)) { + identifier.push_back(c); + } else { + if (c == ':' && allowNSSep) { + logger.error( + "Expected character before and after namespace separator " + "\":\"", + reader); + } + reader.resetPeek(); + break; + } + + // This is no longer the first character + first = false; + + // Advance the hasCharSiceNSSep flag + hasCharSiceNSSep = allowNSSep && (c != ':'); + + end = reader.getPeekOffset(); + reader.consumePeek(); + } + + // Return the identifier at its location + Variant res = + Variant::fromString(std::string(identifier.data(), identifier.size())); + res.setLocation({reader.getSourceId(), start, end}); + return res; +} + +OsdmStreamParser::State OsdmStreamParser::parseBeginCommand() +{ + // Expect a '{' after the command + reader.consumeWhitespace(); + if 
(!reader.expect('{')) { + logger.error("Expected \"{\" after \\begin", reader); + return State::NONE; + } + + // Parse the name of the command that should be opened + Variant commandName = parseIdentifier(reader.getOffset(), true); + if (commandName.asString().empty()) { + logger.error("Expected identifier", commandName); + return State::ERROR; + } + + // Check whether the next character is a '#', indicating the start of the + // command name + Variant commandArgName; + SourceOffset start = reader.getOffset(); + if (reader.expect('#')) { + commandArgName = parseIdentifier(start); + if (commandArgName.asString().empty()) { + logger.error("Expected identifier after \"#\"", commandArgName); + } + } + + if (!reader.expect('}')) { + logger.error("Expected \"}\"", reader); + return State::ERROR; + } + + // Parse the arguments + Variant commandArguments = parseCommandArguments(std::move(commandArgName)); + + // Push the command onto the command stack + pushCommand(std::move(commandName), std::move(commandArguments), true); + + return State::COMMAND; +} + +static bool checkStillInField(const OsdmStreamParser::Command &cmd, + const Variant &endName, Logger &logger) +{ + if (cmd.inField && !cmd.inRangeField) { + logger.error(std::string("\\end in open field of command \"") + + cmd.name.asString() + std::string("\""), + endName); + logger.note(std::string("Open command started here:"), cmd.name); + return true; + } + return false; +} + +OsdmStreamParser::State OsdmStreamParser::parseEndCommand() +{ + // Expect a '{' after the command + if (!reader.expect('{')) { + logger.error("Expected \"{\" after \\end", reader); + return State::NONE; + } + + // Fetch the name of the command that should be ended here + Variant name = parseIdentifier(reader.getOffset(), true); + + // Make sure the given command name is not empty + if (name.asString().empty()) { + logger.error("Expected identifier", name); + return State::ERROR; + } + + // Make sure the command name is terminated with a '}' + 
if (!reader.expect('}')) { + logger.error("Expected \"}\"", reader); + return State::ERROR; + } + + // Unroll the command stack up to the last range command + while (!commands.top().hasRange) { + if (checkStillInField(commands.top(), name, logger)) { + return State::ERROR; + } + commands.pop(); + } + + // Make sure we're not in an open field of this command + if (checkStillInField(commands.top(), name, logger)) { + return State::ERROR; + } + + // Special error message if the top-level command is reached + if (commands.size() == 1) { + logger.error(std::string("Cannot end command \"") + name.asString() + + std::string("\" here, no command open"), + name); + return State::ERROR; + } + + // Inform the user about command mismatches + const Command &cmd = commands.top(); + if (commands.top().name.asString() != name.asString()) { + logger.error(std::string("Trying to end command \"") + + name.asString() + + std::string("\", but open command is \"") + + cmd.name.asString() + std::string("\""), + name); + logger.note("Last command was opened here:", cmd.name); + return State::ERROR; + } + + // Remember whether the command was in its range field before popping it: + // "cmd" refers to the stack top and dangles after the pop + const bool wasInRangeField = cmd.inRangeField; + + // Set the location to the location of the command that was ended, then end + // the current command + location = name.getLocation(); + commands.pop(); + return wasInRangeField ? 
State::FIELD_END : State::NONE; +} + +Variant OsdmStreamParser::parseCommandArguments(Variant commandArgName) +{ + // Parse the arguments using the universal VariantReader + Variant commandArguments; + if (reader.expect('[')) { + auto res = VariantReader::parseObject(reader, logger, ']'); + commandArguments = res.second; + } else { + commandArguments = Variant::mapType{}; + } + + // Insert the parsed name, make sure "name" was not specified in the + // arguments + if (commandArgName.isString()) { + // Copy instead of move: commandArgName is still needed for the + // diagnostics below if "name" is already present + auto res = commandArguments.asMap().emplace("name", commandArgName); + if (!res.second) { + logger.error("Name argument specified multiple times", + SourceLocation{}, MessageMode::NO_CONTEXT); + logger.note("First occurance is here: ", commandArgName); + logger.note("Second occurance is here: ", res.first->second); + } + } + return commandArguments; +} + +void OsdmStreamParser::pushCommand(Variant commandName, + Variant commandArguments, bool hasRange) +{ + // Store the location on the stack + location = commandName.getLocation(); + + // Place the command on the command stack, remove the last commands if we're + // not currently inside a field of these commands + while (!commands.top().inField) { + commands.pop(); + } + commands.push(Command{std::move(commandName), std::move(commandArguments), + hasRange, false, false}); +} + +OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start) +{ + // Parse the commandName as a first identifier + Variant commandName = parseIdentifier(start, true); + if (commandName.asString().empty()) { + logger.error("Empty command name", reader); + return State::NONE; + } + + // Handle the special "begin" and "end" commands + const auto commandNameComponents = + Utils::split(commandName.asString(), ':'); + const bool isBegin = commandNameComponents[0] == "begin"; + const bool isEnd = commandNameComponents[0] == "end"; + if 
(isBegin || isEnd) { + if (commandNameComponents.size() > 1) { + logger.error( + "Special commands 
\"\\begin\" and \"\\end\" may not contain a " + "namespace separator \":\"", + commandName); + } + if (isBegin) { + return parseBeginCommand(); + } else if (isEnd) { + return parseEndCommand(); + } + } + + // Check whether the next character is a '#', indicating the start of the + // command name + Variant commandArgName; + start = reader.getOffset(); + if (reader.expect('#')) { + commandArgName = parseIdentifier(start); + if (commandArgName.asString().empty()) { + logger.error("Expected identifier after \"#\"", commandArgName); + } + } + + // Parse the arguments + Variant commandArguments = parseCommandArguments(std::move(commandArgName)); + + // Push the command onto the command stack + pushCommand(std::move(commandName), std::move(commandArguments), false); + + return State::COMMAND; +} + +void OsdmStreamParser::parseBlockComment() +{ + Token token; + size_t depth = 1; + while (tokenizer.read(reader, token)) { + if (token.type == Tokens.BlockCommentEnd) { + depth--; + if (depth == 0) { + return; + } + } + if (token.type == Tokens.BlockCommentStart) { + depth++; + } + } + + // Issue an error if the file ends while we are in a block comment + logger.error("File ended while being in a block comment", reader); +} + +void OsdmStreamParser::parseLineComment() +{ + char c; + while (reader.read(c)) { + if (c == '\n') { + return; + } + } +} + +bool OsdmStreamParser::checkIssueData(DataHandler &handler) +{ + if (!handler.isEmpty()) { + data = handler.toVariant(reader.getSourceId()); + location = data.getLocation(); + reader.resetPeek(); + return true; + } + return false; +} + +bool OsdmStreamParser::checkIssueFieldStart() +{ + // Fetch the current command, and check whether we're currently inside a + // field of this command + Command &cmd = commands.top(); + if (!cmd.inField) { + // If this is a range command, we're now implicitly inside the field of + // this command -- we'll have to issue a field start command! 
+ if (cmd.hasRange) { + cmd.inField = true; + cmd.inRangeField = true; + reader.resetPeek(); + return true; + } + + // This was not a range command, so obviously we're now inside within + // a field of some command -- so unroll the commands stack until a + // command with open field is reached + while (!commands.top().inField) { + commands.pop(); + } + } + return false; +} + +OsdmStreamParser::State OsdmStreamParser::parse() +{ + // Handler for incoming data + DataHandler handler; + + // Read tokens until the outer loop should be left + Token token; + while (tokenizer.peek(reader, token)) { + const TokenTypeId type = token.type; + + // Special handling for Backslash and Text + if (type == Tokens.Backslash) { + // Before appending anything to the output data or starting a new + // command, check whether FIELD_START has to be issued, as the + // current command is a command with range + if (checkIssueFieldStart()) { + location = token.location; + return State::FIELD_START; + } + + // Check whether a command starts now, without advancing the peek + // cursor + char c; + if (!reader.fetchPeek(c)) { + logger.error("Trailing backslash at the end of the file.", + token); + return State::END; + } + + // Try to parse a command + if (Utils::isIdentifierStartCharacter(c)) { + // Make sure to issue any data before it is too late + if (checkIssueData(handler)) { + return State::DATA; + } + + // Parse the actual command + State res = parseCommand(token.location.getStart()); + switch (res) { + case State::ERROR: + throw LoggableException( + "Last error was irrecoverable, ending parsing " + "process"); + case State::NONE: + continue; + default: + return res; + } + } + + // This was not a special character, just append the given character + // to the data buffer, use the escape character start as start + // location and the peek offset as end location + reader.peek(c); // Peek the previously fetched character + handler.append(c, token.location.getStart(), + reader.getPeekOffset()); 
+ reader.consumePeek(); + continue; + } else if (type == TextToken) { + // Check whether FIELD_START has to be issued before appending text + if (checkIssueFieldStart()) { + location = token.location; + return State::FIELD_START; + } + + // Append the text to the data handler + handler.append(token.content, token.location.getStart(), + token.location.getEnd()); + + reader.consumePeek(); + continue; + } + + // A non-text token was reached, make sure all pending data commands + // have been issued + if (checkIssueData(handler)) { + return State::DATA; + } + + // We will handle the token now, consume the peeked characters + reader.consumePeek(); + + // Update the location to the current token location + location = token.location; + + if (token.type == Tokens.LineComment) { + parseLineComment(); + } else if (token.type == Tokens.BlockCommentStart) { + parseBlockComment(); + } else if (token.type == Tokens.FieldStart) { + Command &cmd = commands.top(); + if (!cmd.inField) { + cmd.inField = true; + return State::FIELD_START; + } + logger.error( + "Got field start token \"{\", but no command for which to " + "start the field. Did you mean \"\\{\"?", + token); + } else if (token.type == Tokens.FieldEnd) { + // Try to end an open field of the current command -- if the current + // command is not inside an open field, end this command and try to + // close the next one + for (int i = 0; i < 2 && commands.size() > 1; i++) { + Command &cmd = commands.top(); + if (!cmd.inRangeField) { + if (cmd.inField) { + cmd.inField = false; + return State::FIELD_END; + } + commands.pop(); + } else { + break; + } + } + logger.error( + "Got field end token \"}\", but there is no field to end. 
Did " + "you mean \"\\}\"?", + token); + } else { + logger.error("Unexpected token \"" + token.content + "\"", token); + } + } + + // Issue available data + if (checkIssueData(handler)) { + return State::DATA; + } + + // Make sure all open commands and fields have been ended at the end of the + // stream + while (commands.size() > 1) { + Command &cmd = commands.top(); + if (cmd.inField || cmd.hasRange) { + logger.error("Reached end of stream, but command \"" + + cmd.name.asString() + "\" has not been ended", + cmd.name); + } + commands.pop(); + } + + location = SourceLocation{reader.getSourceId(), reader.getOffset()}; + return State::END; +} + +const Variant &OsdmStreamParser::getCommandName() +{ + return commands.top().name; +} + +const Variant &OsdmStreamParser::getCommandArguments() +{ + return commands.top().arguments; +} +} + diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp new file mode 100644 index 0000000..84674c0 --- /dev/null +++ b/src/formats/osml/OsmlStreamParser.hpp @@ -0,0 +1,350 @@ +/* + Ousía + Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file OsdmStreamParser.hpp + * + * Provides low-level classes for reading the TeX-esque osdm + * format. The class provided here does not build any model objects and does not + * implement the Parser interface. 
+ * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_OSDM_STREAM_PARSER_HPP_ +#define _OUSIA_OSDM_STREAM_PARSER_HPP_ + +#include + +#include +#include + +namespace ousia { + +// Forward declarations +class CharReader; +class Logger; +class DataHandler; + +/** + * The OsdmStreamParser class provides a low-level reader for the TeX-esque osdm + * format. The parser is constructed around a "parse" function, which reads data + * from the underlying CharReader until a new state is reached and indicates + * this state in a return value. The calling code then has to pull corresponding + * data from the stream reader. The reader makes sure the incoming file is + * syntactically valid and tries to recover from most errors. If an error is + * irrecoverable (this is the case for errors with wrong nesting of commands or + * fields, as this would lead to too many consecutive errors) a + * LoggableException is thrown. + */ +class OsdmStreamParser { +public: + /** + * Enum used to indicate which state the OsdmStreamParser class is in + * after calling the "parse" function. + */ + enum class State { + /** + * State returned if a fully featured command has been read. A command + * consists of the command name and its arguments (which optionally + * includes the name). + */ + COMMAND, + + /** + * State returned if data is given. The reader must decide which field + * or command this should be routed to. Trailing or leading whitespace + * has been removed. Only called if the data is non-empty. + */ + DATA, + + /** + * A user-defined entity has been found. The entity sequence is stored + * in the command name. + */ + ENTITY, + + /** + * State returned if an annotation was started. An annotation consists + * of the command name and its arguments (which optionally include the + * name). + */ + ANNOTATION_START, + + /** + * State returned if an annotation ends. The reader indicates which + * annotation ends. 
+ */ + ANNOTATION_END, + + /** + * State returned if a new field started. The reader assures that the + * current field ends before a new field is started and that the field + * is not started if data has been given outside of a field. The + * field number is set to the current field index. + */ + FIELD_START, + + /** + * State returned if the current field ends. The reader assures that a + * field was actually open. + */ + FIELD_END, + + /** + * The end of the stream has been reached. + */ + END, + + /** + * Returned from internal functions if nothing should be done. + */ + NONE, + + /** + * Returned from internal function to indicate irrecoverable errors. + */ + ERROR + }; + + /** + * Entry used for the command stack. + */ + struct Command { + /** + * Name and location of the current command. + */ + Variant name; + + /** + * Arguments that were passed to the command. + */ + Variant arguments; + + /** + * Set to true if this is a command with clear begin and end. + */ + bool hasRange; + + /** + * Set to true if we are currently inside a field of this command. + */ + bool inField; + + /** + * Set to true if we are currently in the range field of the command + * (implies inField being set to true). + */ + bool inRangeField; + + /** + * Default constructor. + */ + Command() : hasRange(false), inField(false), inRangeField(false) {} + + /** + * Constructor of the Command class. + * + * @param name is a string variant with name and location of the + * command. + * @param arguments is a map variant with the arguments given to the + * command. + * @param hasRange should be set to true if this is a command with + * explicit range. + * @param inField is set to true if we currently are inside a field + * of this command. + * @param inRangeField is set to true if we are currently inside the outer + * field of the command. 
+ */ + Command(Variant name, Variant arguments, bool hasRange, bool inField, + bool inRangeField) + : name(std::move(name)), + arguments(std::move(arguments)), + hasRange(hasRange), + inField(inField), + inRangeField(inRangeField) + { + } + }; + +private: + /** + * Reference to the CharReader instance from which the incoming bytes are + * read. + */ + CharReader &reader; + + /** + * Reference at the logger instance to which all error messages are sent. + */ + Logger &logger; + + /** + * Tokenizer instance used to read individual tokens from the text. + */ + Tokenizer tokenizer; + + /** + * Stack containing the current commands. + */ + std::stack commands; + + /** + * Variant containing the data that has been read (always is a string, + * contains the exact location of the data in the source file). + */ + Variant data; + + /** + * Contains the location of the last token. + */ + SourceLocation location; + + /** + * Contains the field index of the current command. + */ + size_t fieldIdx; + + /** + * Function used internally to parse an identifier. + * + * @param start is the start byte offset of the identifier (including the + * backslash). + * @param allowNSSep should be set to true if the namespace separator is + * allowed in the identifier name. Issues error if the namespace separator + * is placed incorrectly. + */ + Variant parseIdentifier(size_t start, bool allowNSSep = false); + + /** + * Function used internally to handle the special "\begin" command. + */ + State parseBeginCommand(); + + /** + * Function used internally to handle the special "\end" command. + */ + State parseEndCommand(); + + /** + * Pushes the parsed command onto the command stack. + */ + void pushCommand(Variant commandName, Variant commandArguments, + bool hasRange); + + /** + * Parses the command arguments. + */ + Variant parseCommandArguments(Variant commandArgName); + + /** + * Function used internally to parse a command. 
+ * + * @param start is the start byte offset of the command (including the + * backslash) + * @return the State that results from parsing the command. + */ + State parseCommand(size_t start); + + /** + * Function used internally to parse a block comment. + */ + void parseBlockComment(); + + /** + * Function used internally to parse a line comment. + */ + void parseLineComment(); + + /** + * Checks whether there is any data pending to be issued, if yes, issues it. + * + * @param handler is the data handler that contains the data that may be + * returned to the user. + * @return true if there was any data and DATA should be returned by the + * parse function, false otherwise. + */ + bool checkIssueData(DataHandler &handler); + + /** + * Called before any data is appended to the internal data handler. Checks + * whether a new field should be started or implicitly ended. + * + * @return true if FIELD_START should be returned by the parse function. + */ + bool checkIssueFieldStart(); + +public: + /** + * Constructor of the OsdmStreamParser class. Attaches the new + * OsdmStreamParser to the given CharReader and Logger instances. + * + * @param reader is the reader instance from which incoming characters + * should be read. + * @param logger is the logger instance to which errors should be written. + */ + OsdmStreamParser(CharReader &reader, Logger &logger); + + /** + * Continues parsing. Returns one of the states defined in the State enum. + * Callers should stop once the State::END state is reached. Use the getter + * functions to get more information about the current state, such as the + * command name or the data or the current field index. + * + * @return the new state the parser has reached. + */ + State parse(); + + /** + * Returns a reference at the internally stored data. Only valid if + * State::DATA was returned by the "parse" function. + * + * @return a reference at a variant containing the data parsed by the + * "parse" function. 
+ */ + const Variant &getData() { return data; } + + /** + * Returns a reference at the internally stored command name. Only valid if + * State::COMMAND was returned by the "parse" function. + * + * @return a reference at a variant containing name and location of the + * parsed command. + */ + const Variant &getCommandName(); + + /** + * Returns a reference at the internally stored command arguments. Only valid if + * State::COMMAND was returned by the "parse" function. + * + * @return a reference at a variant containing arguments given to the + * command. + */ + const Variant &getCommandArguments(); + + /** + * Returns a reference at the location of the last token. + * + * @return the last internal token location. + */ + SourceLocation &getLocation() { return location; } +}; +} + +#endif /* _OUSIA_OSDM_STREAM_PARSER_HPP_ */ + diff --git a/src/formats/osxml/OsxmlParser.cpp b/src/formats/osxml/OsxmlParser.cpp new file mode 100644 index 0000000..c46d9de --- /dev/null +++ b/src/formats/osxml/OsxmlParser.cpp @@ -0,0 +1,1435 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "XmlParser.hpp" + +namespace ousia { + +/* HeadNode Helper class */ + +namespace { +class HeadNode : public Node { +public: + using Node::Node; +}; +} + +namespace RttiTypes { +static Rtti HeadNode = RttiBuilder("HeadNode"); +} + +/* Element Handler Classes */ + +class DocumentHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted document = + project()->createDocument(args["name"].asString()); + document->setLocation(location()); + scope().push(document); + scope().setFlag(ParserFlag::POST_HEAD, false); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DocumentHandler{handlerData}; + } +}; + +class DocumentField : public Node { +public: + DocumentField(Manager &mgr, std::string name, Handle parent) + : Node(mgr, name, parent) + { + } +}; + +namespace RttiTypes { +const Rtti DocumentField = + RttiBuilder("DocumentField").parent(&Node); +} + +class DocumentChildHandler : public Handler { +public: + using Handler::Handler; + + void preamble(Handle parentNode, std::string &fieldName, + DocumentEntity *&parent, bool &inField) + { + // check if the parent in the structure tree was an explicit field + // reference. + inField = parentNode->isa(&RttiTypes::DocumentField); + if (inField) { + fieldName = parentNode->getName(); + parentNode = scope().selectOrThrow( + {&RttiTypes::StructuredEntity, &RttiTypes::AnnotationEntity}); + } else { + // if it wasn't an explicit reference, we use the default field. + fieldName = DEFAULT_FIELD_NAME; + } + // reference the parent entity explicitly. 
+        parent = nullptr;
+        if (parentNode->isa(&RttiTypes::StructuredEntity)) {
+            parent = static_cast(
+                parentNode.cast().get());
+        } else if (parentNode->isa(&RttiTypes::AnnotationEntity)) {
+            parent = static_cast(
+                parentNode.cast().get());
+        }
+    }
+
+    /**
+     * Handles the start of a child element inside a document. If the tag
+     * names a field of the parent's descriptor, a DocumentField marker node
+     * is pushed onto the scope; otherwise the tag is resolved and a new
+     * structured entity is created below the parent.
+     *
+     * @param args are the arguments of the element; a "name" entry is
+     * removed from the map and used as the name of the created entity.
+     */
+    void start(Variant::mapType &args) override
+    {
+        scope().setFlag(ParserFlag::POST_HEAD, true);
+        Rooted parentNode = scope().selectOrThrow(
+            {&RttiTypes::Document, &RttiTypes::StructuredEntity,
+             &RttiTypes::AnnotationEntity, &RttiTypes::DocumentField});
+
+        std::string fieldName;
+        DocumentEntity *parent;
+        bool inField;
+
+        preamble(parentNode, fieldName, parent, inField);
+
+        // try to find a FieldDescriptor for the given tag if we are not in a
+        // field already.
+        // TODO: Consider fields of transparent classes
+        if (!inField && parent != nullptr &&
+            parent->getDescriptor()->hasField(name())) {
+            // The marker node must carry the name of the referenced field
+            // (the tag name): preamble() reads it back via getName() when
+            // children or data arrive. At this point "fieldName" always holds
+            // DEFAULT_FIELD_NAME (we are not in a field), so using it here
+            // would silently redirect the content to the default field.
+            Rooted field{new DocumentField(
+                parentNode->getManager(), name(), parentNode)};
+            field->setLocation(location());
+            scope().push(field);
+            return;
+        }
+
+        // Otherwise create a new StructuredEntity
+        // TODO: Consider Anchors and AnnotationEntities
+        Rooted strct = scope().resolve(
+            Utils::split(name(), ':'), logger());
+        if (strct == nullptr) {
+            // if we could not resolve the name, throw an exception.
+            throw LoggableException(
+                std::string("\"") + name() + "\" could not be resolved.",
+                location());
+        }
+
+        // From here on the local "name" shadows the name() accessor
+        std::string name;
+        auto it = args.find("name");
+        if (it != args.end()) {
+            name = it->second.asString();
+            args.erase(it);
+        }
+
+        Rooted entity;
+        if (parentNode->isa(&RttiTypes::Document)) {
+            entity = parentNode.cast()->createRootStructuredEntity(
+                strct, args, name);
+        } else {
+            // calculate a path if transparent entities are needed in between. 
+ auto path = parent->getDescriptor()->pathTo(strct); + if (path.empty()) { + throw LoggableException( + std::string("An instance of \"") + strct->getName() + + "\" is not allowed as child of an instance of \"" + + parent->getDescriptor()->getName() + "\"", + location()); + } + + // create all transparent entities until the last field. + for (size_t p = 1; p < path.size() - 1; p = p + 2) { + parent = static_cast( + parent->createChildStructuredEntity( + path[p].cast(), + Variant::mapType{}, path[p - 1]->getName(), + "").get()); + } + entity = parent->createChildStructuredEntity(strct, args, fieldName, + name); + } + entity->setLocation(location()); + scope().push(entity); + } + + void end() override { scope().pop(); } + + void data(const std::string &data, int fieldIdx) override + { + Rooted parentNode = scope().selectOrThrow( + {&RttiTypes::StructuredEntity, &RttiTypes::AnnotationEntity, + &RttiTypes::DocumentField}); + + std::string fieldName; + DocumentEntity *parent; + bool inField; + + preamble(parentNode, fieldName, parent, inField); + + // retrieve the correct FieldDescriptor. + // TODO: Consider fields of transparent classes + Rooted desc = parent->getDescriptor(); + Rooted field = desc->getFieldDescriptor(fieldName); + if (field == nullptr) { + logger().error( + std::string("Can't handle data because no field with name \"") + + fieldName + "\" exists in descriptor\"" + desc->getName() + + "\".", + location()); + return; + } + if (!field->isPrimitive()) { + logger().error(std::string("Can't handle data because field \"") + + fieldName + "\" of descriptor \"" + + desc->getName() + "\" is not primitive!", + location()); + return; + } + + // try to parse the content. + auto res = VariantReader::parseGenericString( + data, logger(), location().getSourceId(), location().getStart()); + if (!res.first) { + return; + } + // try to convert it to the correct type. 
+ if (!field->getPrimitiveType()->build(res.second, logger())) { + return; + } + // add it as primitive content. + parent->createChildDocumentPrimitive(res.second, fieldName); + } + + static Handler *create(const HandlerData &handlerData) + { + return new DocumentChildHandler{handlerData}; + } +}; + /** + * Handler creating a new typesystem in the project and pushing it onto the + * scope; end() pops it from the scope again. + */ +class TypesystemHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + // Create the typesystem instance + Rooted typesystem = + project()->createTypesystem(args["name"].asString()); + typesystem->setLocation(location()); + + // Push the typesystem onto the scope, reset the POST_HEAD flag to false + scope().push(typesystem); + scope().setFlag(ParserFlag::POST_HEAD, false); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new TypesystemHandler{handlerData}; + } +}; + /** + * Handler creating an enum type in the typesystem selected from the scope + * and pushing the new enum type onto the scope; end() pops it again. + */ +class TypesystemEnumHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + scope().setFlag(ParserFlag::POST_HEAD, true); + + // Fetch the current typesystem and create the enum node + Rooted typesystem = scope().selectOrThrow(); + Rooted enumType = + typesystem->createEnumType(args["name"].asString()); + enumType->setLocation(location()); + + scope().push(enumType); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new TypesystemEnumHandler{handlerData}; + } +}; + /** + * Handler collecting the character data of an enum entry in "entry" and + * adding it to the enum type selected from the scope once the element ends. 
+ */ +class TypesystemEnumEntryHandler : public Handler { +public: + using Handler::Handler; + + std::string entry; + + void start(Variant::mapType &args) override {} + + void end() override + { + Rooted enumType = scope().selectOrThrow(); + enumType->addEntry(entry, logger()); + } + + void data(const std::string &data, int field) override + { + if (field != 0) { + // TODO: This should be stored in the HandlerData + logger().error("Enum entry only has one field."); + return; + } + 
entry.append(data); + } + + static Handler *create(const HandlerData &handlerData) + { + return new TypesystemEnumEntryHandler{handlerData}; + } +}; + +class TypesystemStructHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + scope().setFlag(ParserFlag::POST_HEAD, true); + + // Fetch the arguments used for creating this type + const std::string &name = args["name"].asString(); + const std::string &parent = args["parent"].asString(); + + // Fetch the current typesystem and create the struct node + Rooted typesystem = scope().selectOrThrow(); + Rooted structType = typesystem->createStructType(name); + structType->setLocation(location()); + + // Try to resolve the parent type and set it as parent structure + if (!parent.empty()) { + scope().resolve( + parent, structType, logger(), + [](Handle parent, Handle structType, + Logger &logger) { + if (parent != nullptr) { + structType.cast()->setParentStructure( + parent.cast(), logger); + } + }); + } + scope().push(structType); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new TypesystemStructHandler{handlerData}; + } +}; + +class TypesystemStructFieldHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + // Read the argument values + const std::string &name = args["name"].asString(); + const std::string &type = args["type"].asString(); + const Variant &defaultValue = args["default"]; + const bool optional = + !(defaultValue.isObject() && defaultValue.asObject() == nullptr); + + Rooted structType = scope().selectOrThrow(); + Rooted attribute = + structType->createAttribute(name, defaultValue, optional, logger()); + attribute->setLocation(location()); + + // Try to resolve the type and default value + if (optional) { + scope().resolveTypeWithValue( + type, attribute, attribute->getDefaultValue(), logger(), + [](Handle type, Handle 
attribute, Logger &logger) { + if (type != nullptr) { + attribute.cast()->setType(type.cast(), + logger); + } + }); + } else { + scope().resolveType( + type, attribute, logger(), + [](Handle type, Handle attribute, Logger &logger) { + if (type != nullptr) { + attribute.cast()->setType(type.cast(), + logger); + } + }); + } + } + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new TypesystemStructFieldHandler{handlerData}; + } +}; + +class TypesystemConstantHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + scope().setFlag(ParserFlag::POST_HEAD, true); + + // Read the argument values + const std::string &name = args["name"].asString(); + const std::string &type = args["type"].asString(); + const Variant &value = args["value"]; + + Rooted typesystem = scope().selectOrThrow(); + Rooted constant = typesystem->createConstant(name, value); + constant->setLocation(location()); + + // Try to resolve the type + scope().resolveTypeWithValue( + type, constant, constant->getValue(), logger(), + [](Handle type, Handle constant, Logger &logger) { + if (type != nullptr) { + constant.cast()->setType(type.cast(), + logger); + } + }); + } + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new TypesystemConstantHandler{handlerData}; + } +}; + +/* + * Domain Handlers + */ + +class DomainHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted domain = + project()->createDomain(args["name"].asString()); + domain->setLocation(location()); + + scope().push(domain); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainHandler{handlerData}; + } +}; + +class DomainStructHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + 
scope().setFlag(ParserFlag::POST_HEAD, true); + + Rooted domain = scope().selectOrThrow(); + + Rooted structuredClass = domain->createStructuredClass( + args["name"].asString(), args["cardinality"].asCardinality(), + nullptr, args["transparent"].asBool(), args["isRoot"].asBool()); + structuredClass->setLocation(location()); + + const std::string &isa = args["isa"].asString(); + if (!isa.empty()) { + scope().resolve( + isa, structuredClass, logger(), + [](Handle superclass, Handle structuredClass, + Logger &logger) { + if (superclass != nullptr) { + structuredClass.cast()->setSuperclass( + superclass.cast(), logger); + } + }); + } + + scope().push(structuredClass); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainStructHandler{handlerData}; + } +}; + /** + * Handler creating an annotation class in the domain selected from the + * scope and pushing it onto the scope; end() pops it again. + */ +class DomainAnnotationHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + scope().setFlag(ParserFlag::POST_HEAD, true); + + Rooted domain = scope().selectOrThrow(); + + Rooted annotationClass = + domain->createAnnotationClass(args["name"].asString()); + annotationClass->setLocation(location()); + + scope().push(annotationClass); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainAnnotationHandler{handlerData}; + } +}; + /** + * Handler pushing the attributes descriptor of the enclosing node onto the + * scope, so that nested elements operate on it; end() pops it again. 
+ */ +class DomainAttributesHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + // Fetch the enclosing node from the scope and push its attributes descriptor + Rooted parent = scope().selectOrThrow(); + + Rooted attrDesc = parent->getAttributesDescriptor(); + attrDesc->setLocation(location()); + + scope().push(attrDesc); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainAttributesHandler{handlerData}; + } +}; + +class DomainFieldHandler : public Handler { +public: + 
using Handler::Handler; + + void start(Variant::mapType &args) override + { + FieldDescriptor::FieldType type; + if (args["isSubtree"].asBool()) { + type = FieldDescriptor::FieldType::SUBTREE; + } else { + type = FieldDescriptor::FieldType::TREE; + } + + Rooted parent = scope().selectOrThrow(); + + Rooted field = parent->createFieldDescriptor( + type, args["name"].asString(), args["optional"].asBool()); + field->setLocation(location()); + + scope().push(field); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainFieldHandler{handlerData}; + } +}; + +class DomainFieldRefHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted parent = scope().selectOrThrow(); + + const std::string &name = args["name"].asString(); + scope().resolve( + name, parent, logger(), + [](Handle field, Handle parent, Logger &logger) { + if (field != nullptr) { + parent.cast()->addFieldDescriptor( + field.cast()); + } + }); + } + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new DomainFieldRefHandler{handlerData}; + } +}; + +class DomainPrimitiveHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted parent = scope().selectOrThrow(); + + Rooted field = parent->createPrimitiveFieldDescriptor( + nullptr, args["name"].asString(), args["optional"].asBool()); + field->setLocation(location()); + + const std::string &type = args["type"].asString(); + scope().resolve( + type, field, logger(), + [](Handle type, Handle field, Logger &logger) { + if (type != nullptr) { + field.cast()->setPrimitiveType( + type.cast()); + } + }); + + scope().push(field); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainPrimitiveHandler{handlerData}; + } +}; + +class DomainChildHandler : 
public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted field = + scope().selectOrThrow(); + + const std::string &ref = args["ref"].asString(); + scope().resolve( + ref, field, logger(), + [](Handle child, Handle field, Logger &logger) { + if (child != nullptr) { + field.cast()->addChild( + child.cast()); + } + }); + } + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new DomainChildHandler{handlerData}; + } +}; + +class DomainParent : public Node { +public: + DomainParent(Manager &mgr, std::string name, Handle parent) + : Node(mgr, name, parent) + { + } +}; + +namespace RttiTypes { +const Rtti DomainParent = + RttiBuilder("DomainParent").parent(&Node); +} + +class DomainParentHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted strct = + scope().selectOrThrow(); + + Rooted parent{new DomainParent( + strct->getManager(), args["name"].asString(), strct)}; + parent->setLocation(location()); + scope().push(parent); + } + + void end() override { scope().pop(); } + + static Handler *create(const HandlerData &handlerData) + { + return new DomainParentHandler{handlerData}; + } +}; + +class DomainParentFieldHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted parentNameNode = + scope().selectOrThrow(); + FieldDescriptor::FieldType type; + if (args["isSubtree"].asBool()) { + type = FieldDescriptor::FieldType::SUBTREE; + } else { + type = FieldDescriptor::FieldType::TREE; + } + + const std::string &name = args["name"].asString(); + const bool optional = args["optional"].asBool(); + Rooted strct = + parentNameNode->getParent().cast(); + + // resolve the parent, create the declared field and add the declared + // StructuredClass as child to it. 
+ scope().resolve( + parentNameNode->getName(), strct, logger(), + [type, name, optional](Handle parent, Handle strct, + Logger &logger) { + if (parent != nullptr) { + Rooted field = + parent.cast()->createFieldDescriptor( + type, name, optional); + field->addChild(strct.cast()); + } + }); + } + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new DomainParentFieldHandler{handlerData}; + } +}; + +class DomainParentFieldRefHandler : public Handler { +public: + using Handler::Handler; + + void start(Variant::mapType &args) override + { + Rooted parentNameNode = + scope().selectOrThrow(); + + const std::string &name = args["name"].asString(); + Rooted strct = + parentNameNode->getParent().cast(); + auto loc = location(); + + // resolve the parent, get the referenced field and add the declared + // StructuredClass as child to it. + scope().resolve(parentNameNode->getName(), strct, logger(), + [name, loc](Handle parent, + Handle strct, + Logger &logger) { + if (parent != nullptr) { + auto res = parent.cast()->resolve( + &RttiTypes::FieldDescriptor, name); + if (res.size() != 1) { + logger.error( + std::string("Could not find referenced field ") + name, + loc); + return; + } + Rooted field = + res[0].node.cast(); + field->addChild(strct.cast()); + } + }); + } + + void end() override {} + + static Handler *create(const HandlerData &handlerData) + { + return new DomainParentFieldRefHandler{handlerData}; + } +}; + +/* + * Import and Include Handler + */ + +class ImportIncludeHandler : public Handler { +public: + using Handler::Handler; + + bool srcInArgs = false; + std::string rel; + std::string type; + std::string src; + + void start(Variant::mapType &args) override + { + rel = args["rel"].asString(); + type = args["type"].asString(); + src = args["src"].asString(); + srcInArgs = !src.empty(); + } + + void data(const std::string &data, int field) override + { + if (srcInArgs) { + logger().error("\"src\" attribute has already 
been set"); + return; + } + if (field != 0) { + logger().error("Command has only one field."); + return; + } + src.append(data); + } +}; + +class ImportHandler : public ImportIncludeHandler { +public: + using ImportIncludeHandler::ImportIncludeHandler; + + void start(Variant::mapType &args) override + { + ImportIncludeHandler::start(args); + + // Make sure imports are still possible + if (scope().getFlag(ParserFlag::POST_HEAD)) { + logger().error("Imports must be listed before other commands.", + location()); + return; + } + } + + void end() override + { + // Fetch the last node and check whether an import is valid at this + // position + Rooted leaf = scope().getLeaf(); + if (leaf == nullptr || !leaf->isa(&RttiTypes::RootNode)) { + logger().error( + "Import not supported here, must be inside a document, domain " + "or typesystem command.", + location()); + return; + } + Rooted leafRootNode = leaf.cast(); + + // Perform the actual import, register the imported node within the leaf + // node + Rooted imported = + context().import(src, type, rel, leafRootNode->getReferenceTypes()); + if (imported != nullptr) { + leafRootNode->reference(imported); + } + } + + static Handler *create(const HandlerData &handlerData) + { + return new ImportHandler{handlerData}; + } +}; + +class IncludeHandler : public ImportIncludeHandler { +public: + using ImportIncludeHandler::ImportIncludeHandler; + + void start(Variant::mapType &args) override + { + ImportIncludeHandler::start(args); + } + + void end() override + { + context().include(src, type, rel, {&RttiTypes::Node}); + } + + static Handler *create(const HandlerData &handlerData) + { + return new IncludeHandler{handlerData}; + } +}; + +namespace ParserStates { +/* Document states */ +static const ParserState Document = + ParserStateBuilder() + .parent(&None) + .createdNodeType(&RttiTypes::Document) + .elementHandler(DocumentHandler::create) + .arguments({Argument::String("name", "")}); + +static const ParserState DocumentChild = + 
ParserStateBuilder() + .parents({&Document, &DocumentChild}) + .createdNodeTypes({&RttiTypes::StructureNode, + &RttiTypes::AnnotationEntity, + &RttiTypes::DocumentField}) + .elementHandler(DocumentChildHandler::create); + +/* Domain states */ +static const ParserState Domain = ParserStateBuilder() + .parents({&None, &Document}) + .createdNodeType(&RttiTypes::Domain) + .elementHandler(DomainHandler::create) + .arguments({Argument::String("name")}); + +static const ParserState DomainStruct = + ParserStateBuilder() + .parent(&Domain) + .createdNodeType(&RttiTypes::StructuredClass) + .elementHandler(DomainStructHandler::create) + .arguments({Argument::String("name"), + Argument::Cardinality("cardinality", Cardinality::any()), + Argument::Bool("isRoot", false), + Argument::Bool("transparent", false), + Argument::String("isa", "")}); + +static const ParserState DomainAnnotation = + ParserStateBuilder() + .parent(&Domain) + .createdNodeType(&RttiTypes::AnnotationClass) + .elementHandler(DomainAnnotationHandler::create) + .arguments({Argument::String("name")}); + +static const ParserState DomainAttributes = + ParserStateBuilder() + .parents({&DomainStruct, &DomainAnnotation}) + .createdNodeType(&RttiTypes::StructType) + .elementHandler(DomainAttributesHandler::create) + .arguments({}); + +static const ParserState DomainAttribute = + ParserStateBuilder() + .parent(&DomainAttributes) + .elementHandler(TypesystemStructFieldHandler::create) + .arguments({Argument::String("name"), Argument::String("type"), + Argument::Any("default", Variant::fromObject(nullptr))}); + +static const ParserState DomainField = + ParserStateBuilder() + .parents({&DomainStruct, &DomainAnnotation}) + .createdNodeType(&RttiTypes::FieldDescriptor) + .elementHandler(DomainFieldHandler::create) + .arguments({Argument::String("name", DEFAULT_FIELD_NAME), + Argument::Bool("isSubtree", false), + Argument::Bool("optional", false)}); + +static const ParserState DomainFieldRef = + ParserStateBuilder() + 
.parents({&DomainStruct, &DomainAnnotation}) + .createdNodeType(&RttiTypes::FieldDescriptor) + .elementHandler(DomainFieldRefHandler::create) + .arguments({Argument::String("name", DEFAULT_FIELD_NAME)}); + +static const ParserState DomainStructPrimitive = + ParserStateBuilder() + .parents({&DomainStruct, &DomainAnnotation}) + .createdNodeType(&RttiTypes::FieldDescriptor) + .elementHandler(DomainPrimitiveHandler::create) + .arguments({Argument::String("name", DEFAULT_FIELD_NAME), + Argument::Bool("optional", false), + Argument::String("type")}); + +static const ParserState DomainStructChild = + ParserStateBuilder() + .parent(&DomainField) + .elementHandler(DomainChildHandler::create) + .arguments({Argument::String("ref")}); + +static const ParserState DomainStructParent = + ParserStateBuilder() + .parent(&DomainStruct) + .createdNodeType(&RttiTypes::DomainParent) + .elementHandler(DomainParentHandler::create) + .arguments({Argument::String("name")}); + +static const ParserState DomainStructParentField = + ParserStateBuilder() + .parent(&DomainStructParent) + .createdNodeType(&RttiTypes::FieldDescriptor) + .elementHandler(DomainParentFieldHandler::create) + .arguments({Argument::String("name", DEFAULT_FIELD_NAME), + Argument::Bool("isSubtree", false), + Argument::Bool("optional", false)}); + +static const ParserState DomainStructParentFieldRef = + ParserStateBuilder() + .parent(&DomainStructParent) + .createdNodeType(&RttiTypes::FieldDescriptor) + .elementHandler(DomainParentFieldRefHandler::create) + .arguments({Argument::String("name", DEFAULT_FIELD_NAME)}); + +/* Typesystem states */ +static const ParserState Typesystem = + ParserStateBuilder() + .parents({&None, &Domain}) + .createdNodeType(&RttiTypes::Typesystem) + .elementHandler(TypesystemHandler::create) + .arguments({Argument::String("name", "")}); + +static const ParserState TypesystemEnum = + ParserStateBuilder() + .parent(&Typesystem) + .createdNodeType(&RttiTypes::EnumType) + 
.elementHandler(TypesystemEnumHandler::create) + .arguments({Argument::String("name")}); + +static const ParserState TypesystemEnumEntry = + ParserStateBuilder() + .parent(&TypesystemEnum) + .elementHandler(TypesystemEnumEntryHandler::create) + .arguments({}); + +static const ParserState TypesystemStruct = + ParserStateBuilder() + .parent(&Typesystem) + .createdNodeType(&RttiTypes::StructType) + .elementHandler(TypesystemStructHandler::create) + .arguments({Argument::String("name"), Argument::String("parent", "")}); + +static const ParserState TypesystemStructField = + ParserStateBuilder() + .parent(&TypesystemStruct) + .elementHandler(TypesystemStructFieldHandler::create) + .arguments({Argument::String("name"), Argument::String("type"), + Argument::Any("default", Variant::fromObject(nullptr))}); + +static const ParserState TypesystemConstant = + ParserStateBuilder() + .parent(&Typesystem) + .createdNodeType(&RttiTypes::Constant) + .elementHandler(TypesystemConstantHandler::create) + .arguments({Argument::String("name"), Argument::String("type"), + Argument::Any("value")}); + +/* Special states for import and include */ +static const ParserState Import = + ParserStateBuilder() + .parents({&Document, &Typesystem, &Domain}) + .elementHandler(ImportHandler::create) + .arguments({Argument::String("rel", ""), Argument::String("type", ""), + Argument::String("src", "")}); + +static const ParserState Include = + ParserStateBuilder() + .parent(&All) + .elementHandler(IncludeHandler::create) + .arguments({Argument::String("rel", ""), Argument::String("type", ""), + Argument::String("src", "")}); + +static const std::multimap XmlStates{ + {"document", &Document}, + {"*", &DocumentChild}, + {"domain", &Domain}, + {"struct", &DomainStruct}, + {"annotation", &DomainAnnotation}, + {"attributes", &DomainAttributes}, + {"attribute", &DomainAttribute}, + {"field", &DomainField}, + {"fieldRef", &DomainFieldRef}, + {"primitive", &DomainStructPrimitive}, + {"child", 
&DomainStructChild},
+    {"parent", &DomainStructParent},
+    {"field", &DomainStructParentField},
+    {"fieldRef", &DomainStructParentFieldRef},
+    {"typesystem", &Typesystem},
+    {"enum", &TypesystemEnum},
+    {"entry", &TypesystemEnumEntry},
+    {"struct", &TypesystemStruct},
+    {"field", &TypesystemStructField},
+    {"constant", &TypesystemConstant},
+    {"import", &Import},
+    {"include", &Include}};
+}
+
+/**
+ * Structure containing the private data that is being passed to the
+ * XML-Handlers.
+ */
+struct XMLUserData {
+    /**
+     * Current element nesting depth in the XML file (starts at zero).
+     */
+    size_t depth;
+
+    /**
+     * Pointer to the ParserStack instance.
+     */
+    ParserStack *stack;
+
+    /**
+     * Pointer to the CharReader instance.
+     */
+    CharReader *reader;
+
+    /**
+     * Constructor of the XMLUserData struct.
+     *
+     * @param stack is a pointer to the ParserStack instance.
+     * @param reader is a pointer to the CharReader instance.
+     */
+    XMLUserData(ParserStack *stack, CharReader *reader)
+        : depth(0), stack(stack), reader(reader)
+    {
+    }
+};
+
+/**
+ * Wrapper class around the XML_Parser pointer which safely frees it whenever
+ * the scope is left (e.g. because an exception was thrown). The wrapper is the
+ * sole owner of the parser and therefore cannot be copied.
+ */
+class ScopedExpatXmlParser {
+private:
+    /**
+     * Internal pointer to the XML_Parser instance.
+     */
+    XML_Parser parser;
+
+public:
+    /**
+     * Constructor of the ScopedExpatXmlParser class. Calls XML_ParserCreate
+     * from the expat library. Throws a LoggableException if the XML parser
+     * cannot be initialized.
+     *
+     * @param encoding is the protocol-defined encoding passed to expat (or
+     * nullptr if expat should determine the encoding by itself).
+     */
+    ScopedExpatXmlParser(const XML_Char *encoding) : parser(nullptr)
+    {
+        parser = XML_ParserCreate(encoding);
+        if (!parser) {
+            throw LoggableException{
+                "Internal error: Could not create expat XML parser!"};
+        }
+    }
+
+    /**
+     * Copying is disabled: a copy would hold the same XML_Parser pointer and
+     * both destructors would call XML_ParserFree on it.
+     */
+    ScopedExpatXmlParser(const ScopedExpatXmlParser &) = delete;
+    ScopedExpatXmlParser &operator=(const ScopedExpatXmlParser &) = delete;
+
+    /**
+     * Destructor of the ScopedExpatXmlParser, frees the XML parser instance.
+     */
+    ~ScopedExpatXmlParser()
+    {
+        if (parser) {
+            XML_ParserFree(parser);
+            parser = nullptr;
+        }
+    }
+
+    /**
+     * Returns the XML_Parser pointer.
+     */
+    XML_Parser operator&() { return parser; }
+};
+
+/* Adapter Expat -> ParserStack */
+
+/**
+ * Builds a SourceLocation from the current byte index reported by expat and
+ * installs it as default location of the logger of the parser context.
+ *
+ * @param p is the expat parser whose user data is an XMLUserData instance.
+ * @param len is the length of the location in bytes.
+ * @return the location that has been set as default location.
+ */
+static SourceLocation syncLoggerPosition(XML_Parser p, size_t len = 0)
+{
+    // Fetch the parser stack and the associated user data
+    XMLUserData *userData = static_cast(XML_GetUserData(p));
+    ParserStack *stack = userData->stack;
+
+    // Fetch the current location in the XML file
+    size_t offs = XML_GetCurrentByteIndex(p);
+
+    // Build the source location and update the default location of the
+    // current logger instance
+    SourceLocation loc{stack->getContext().getSourceId(), offs, offs + len};
+    stack->getContext().getLogger().setDefaultLocation(loc);
+    return loc;
+}
+
+enum class XMLAttributeState {
+    IN_TAG_NAME,
+    SEARCH_ATTR,
+    IN_ATTR_NAME,
+    HAS_ATTR_NAME,
+    HAS_ATTR_EQUALS,
+    IN_ATTR_DATA
+};
+
+static std::map reconstructXMLAttributeOffsets(
+    CharReader &reader, SourceLocation location)
+{
+    std::map res;
+
+    // Fork the reader, we don't want to mess up the XML parsing process, do we?
+    CharReaderFork readerFork = reader.fork();
+
+    // Move the read cursor to the start location, abort if this does not work
+    size_t offs = location.getStart();
+    if (!location.isValid() || offs != readerFork.seek(offs)) {
+        return res;
+    }
+
+    // Now all we need to do is to implement one half of an XML parser. As this
+    // is inherently complicated we'll totally fail at it. Don't care. All we
+    // want to get is those darn offsets for pretty error messages... 
(and we + // can assume the XML is valid as it was already read by expat) + XMLAttributeState state = XMLAttributeState::IN_TAG_NAME; + char c; + std::stringstream attrName; + while (readerFork.read(c)) { + // Abort at the end of the tag + if (c == '>' && state != XMLAttributeState::IN_ATTR_DATA) { + return res; + } + + // One state machine to rule them all, one state machine to find them, + // One state machine to bring them all and in the darkness bind them + // (the byte offsets) + switch (state) { + case XMLAttributeState::IN_TAG_NAME: + if (Utils::isWhitespace(c)) { + state = XMLAttributeState::SEARCH_ATTR; + } + break; + case XMLAttributeState::SEARCH_ATTR: + if (!Utils::isWhitespace(c)) { + state = XMLAttributeState::IN_ATTR_NAME; + attrName << c; + } + break; + case XMLAttributeState::IN_ATTR_NAME: + if (Utils::isWhitespace(c)) { + state = XMLAttributeState::HAS_ATTR_NAME; + } else if (c == '=') { + state = XMLAttributeState::HAS_ATTR_EQUALS; + } else { + attrName << c; + } + break; + case XMLAttributeState::HAS_ATTR_NAME: + if (!Utils::isWhitespace(c)) { + if (c == '=') { + state = XMLAttributeState::HAS_ATTR_EQUALS; + break; + } + // Well, this is a strange XML file... We expected to + // see a '=' here! Try to continue with the + // "HAS_ATTR_EQUALS" state as this state will hopefully + // inlcude some error recovery + } else { + // Skip whitespace here + break; + } + // Fallthrough + case XMLAttributeState::HAS_ATTR_EQUALS: + if (!Utils::isWhitespace(c)) { + if (c == '"') { + // Here we are! We have found the beginning of an + // attribute. Let's quickly lock the current offset away + // in the result map + res.emplace(attrName.str(), + SourceLocation{reader.getSourceId(), + readerFork.getOffset()}); + attrName.str(std::string{}); + state = XMLAttributeState::IN_ATTR_DATA; + } else { + // No, this XML file is not well formed. 
Assume we're in + // an attribute name once again + attrName.str(std::string{&c, 1}); + state = XMLAttributeState::IN_ATTR_NAME; + } + } + break; + case XMLAttributeState::IN_ATTR_DATA: + if (c == '"') { + // We're at the end of the attribute data, start anew + state = XMLAttributeState::SEARCH_ATTR; + } + break; + } + } + return res; +} + +static void xmlStartElementHandler(void *p, const XML_Char *name, + const XML_Char **attrs) +{ + XML_Parser parser = static_cast(p); + XMLUserData *userData = static_cast(XML_GetUserData(p)); + ParserStack *stack = userData->stack; + + SourceLocation loc = syncLoggerPosition(parser); + + // Read the argument locations -- this is only a stupid and slow hack, + // but it is necessary, as expat doesn't give use the byte offset of the + // arguments. + std::map offs = + reconstructXMLAttributeOffsets(*userData->reader, loc); + + // Assemble the arguments + Variant::mapType args; + + const XML_Char **attr = attrs; + while (*attr) { + // Convert the C string to a std::string + const std::string key{*(attr++)}; + + // Search the location of the key + SourceLocation keyLoc; + auto it = offs.find(key); + if (it != offs.end()) { + keyLoc = it->second; + } + + // Parse the string, pass the location of the key + std::pair value = VariantReader::parseGenericString( + *(attr++), stack->getContext().getLogger(), keyLoc.getSourceId(), + keyLoc.getStart()); + args.emplace(key, value.second); + } + + // Call the start function + std::string nameStr(name); + if (nameStr != "ousia" || userData->depth > 0) { + stack->start(std::string(name), args, loc); + } + + // Increment the current depth + userData->depth++; +} + +static void xmlEndElementHandler(void *p, const XML_Char *name) +{ + XML_Parser parser = static_cast(p); + XMLUserData *userData = static_cast(XML_GetUserData(p)); + ParserStack *stack = userData->stack; + + syncLoggerPosition(parser); + + // Decrement the current depth + userData->depth--; + + // Call the end function + std::string 
nameStr(name); + if (nameStr != "ousia" || userData->depth > 0) { + stack->end(); + } +} + +static void xmlCharacterDataHandler(void *p, const XML_Char *s, int len) +{ + XML_Parser parser = static_cast(p); + XMLUserData *userData = static_cast(XML_GetUserData(p)); + ParserStack *stack = userData->stack; + + size_t ulen = len > 0 ? static_cast(len) : 0; + syncLoggerPosition(parser, ulen); + const std::string data = Utils::trim(std::string{s, ulen}); + if (!data.empty()) { + stack->data(data); + } +} + +/* Class XmlParser */ + +void XmlParser::doParse(CharReader &reader, ParserContext &ctx) +{ + // Create the parser object + ScopedExpatXmlParser p{"UTF-8"}; + + // Create the parser stack instance, if we're starting on a non-empty scope, + // try to deduce the parser state + ParserStack stack(ctx, ParserStates::XmlStates); + if (!ctx.getScope().isEmpty()) { + if (!stack.deduceState()) { + return; + } + } + + // Pass the reference to the ParserStack to the XML handler + XMLUserData data(&stack, &reader); + XML_SetUserData(&p, &data); + XML_UseParserAsHandlerArg(&p); + + // Set the callback functions + XML_SetStartElementHandler(&p, xmlStartElementHandler); + XML_SetEndElementHandler(&p, xmlEndElementHandler); + XML_SetCharacterDataHandler(&p, xmlCharacterDataHandler); + + // Feed data into expat while there is data to process + constexpr size_t BUFFER_SIZE = 64 * 1024; + while (true) { + // Fetch a buffer from expat for the input data + char *buf = static_cast(XML_GetBuffer(&p, BUFFER_SIZE)); + if (!buf) { + throw LoggableException{ + "Internal error: XML parser out of memory!"}; + } + + // Read into the buffer + size_t bytesRead = reader.readRaw(buf, BUFFER_SIZE); + + // Parse the data and handle any XML error + if (!XML_ParseBuffer(&p, bytesRead, bytesRead == 0)) { + // Fetch the xml parser byte offset + size_t offs = XML_GetCurrentByteIndex(&p); + + // Throw a corresponding exception + XML_Error code = XML_GetErrorCode(&p); + std::string msg = 
std::string{XML_ErrorString(code)}; + throw LoggableException{"XML: " + msg, + SourceLocation{ctx.getSourceId(), offs}}; + } + + // Abort once there are no more bytes in the stream + if (bytesRead == 0) { + break; + } + } +} +} + diff --git a/src/formats/osxml/OsxmlParser.hpp b/src/formats/osxml/OsxmlParser.hpp new file mode 100644 index 0000000..c8b6302 --- /dev/null +++ b/src/formats/osxml/OsxmlParser.hpp @@ -0,0 +1,55 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file XmlParser.hpp + * + * Contains the parser responsible for reading Ousía XML Documents (extension + * oxd) and Ousía XML Modules (extension oxm). + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_XML_PARSER_HPP_ +#define _OUSIA_XML_PARSER_HPP_ + +#include + +namespace ousia { + +/** + * The XmlParser class implements parsing the various types of Ousía XML + * documents using the expat stream XML parser. + */ +class XmlParser : public Parser { +protected: + /** + * Parses the given input stream as XML file and returns the parsed + * top-level node. + * + * @param reader is the CharReader from which the input should be read. + * @param ctx is a reference to the ParserContext instance that should be + * used. 
+ */ + void doParse(CharReader &reader, ParserContext &ctx) override; +}; + +} + +#endif /* _OUSIA_XML_PARSER_HPP_ */ + -- cgit v1.2.3 From 98f43328e566b3a77b75808892246a295adb0eb0 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sat, 14 Feb 2015 23:59:43 +0100 Subject: Renamed osdm to osml and osdmx to osxml --- src/formats/osml/OsmlStreamParser.cpp | 32 +- src/formats/osml/OsmlStreamParser.hpp | 22 +- src/formats/osxml/OsxmlEventParser.cpp | 524 ++++++++++++++++ src/formats/osxml/OsxmlEventParser.hpp | 205 ++++++ src/formats/osxml/OsxmlParser.cpp | 337 ---------- test/formats/osdm/OsdmStreamParserTest.cpp | 973 ----------------------------- test/formats/osdmx/OsdmxParserTest.cpp | 314 ---------- 7 files changed, 756 insertions(+), 1651 deletions(-) create mode 100644 src/formats/osxml/OsxmlEventParser.cpp create mode 100644 src/formats/osxml/OsxmlEventParser.hpp delete mode 100644 test/formats/osdm/OsdmStreamParserTest.cpp delete mode 100644 test/formats/osdmx/OsdmxParserTest.cpp (limited to 'src/formats') diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp index 6a55f12..6b00eef 100644 --- a/src/formats/osml/OsmlStreamParser.cpp +++ b/src/formats/osml/OsmlStreamParser.cpp @@ -21,7 +21,7 @@ #include #include -#include "OsdmStreamParser.hpp" +#include "OsmlStreamParser.hpp" namespace ousia { @@ -160,14 +160,14 @@ public: } }; -OsdmStreamParser::OsdmStreamParser(CharReader &reader, Logger &logger) +OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger) : reader(reader), logger(logger), tokenizer(Tokens) { // Place an intial command representing the complete file on the stack commands.push(Command{"", Variant::mapType{}, true, true, true}); } -Variant OsdmStreamParser::parseIdentifier(size_t start, bool allowNSSep) +Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep) { bool first = true; bool hasCharSiceNSSep = false; @@ -210,7 +210,7 @@ Variant OsdmStreamParser::parseIdentifier(size_t 
start, bool allowNSSep) return res; } -OsdmStreamParser::State OsdmStreamParser::parseBeginCommand() +OsmlStreamParser::State OsmlStreamParser::parseBeginCommand() { // Expect a '{' after the command reader.consumeWhitespace(); @@ -251,7 +251,7 @@ OsdmStreamParser::State OsdmStreamParser::parseBeginCommand() return State::COMMAND; } -static bool checkStillInField(const OsdmStreamParser::Command &cmd, +static bool checkStillInField(const OsmlStreamParser::Command &cmd, const Variant &endName, Logger &logger) { if (cmd.inField && !cmd.inRangeField) { @@ -264,7 +264,7 @@ static bool checkStillInField(const OsdmStreamParser::Command &cmd, return false; } -OsdmStreamParser::State OsdmStreamParser::parseEndCommand() +OsmlStreamParser::State OsmlStreamParser::parseEndCommand() { // Expect a '{' after the command if (!reader.expect('{')) { @@ -327,7 +327,7 @@ OsdmStreamParser::State OsdmStreamParser::parseEndCommand() return cmd.inRangeField ? State::FIELD_END : State::NONE; } -Variant OsdmStreamParser::parseCommandArguments(Variant commandArgName) +Variant OsmlStreamParser::parseCommandArguments(Variant commandArgName) { // Parse the arguments using the universal VariantReader Variant commandArguments; @@ -353,7 +353,7 @@ Variant OsdmStreamParser::parseCommandArguments(Variant commandArgName) return commandArguments; } -void OsdmStreamParser::pushCommand(Variant commandName, +void OsmlStreamParser::pushCommand(Variant commandName, Variant commandArguments, bool hasRange) { // Store the location on the stack @@ -368,7 +368,7 @@ void OsdmStreamParser::pushCommand(Variant commandName, hasRange, false, false}); } -OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start) +OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start) { // Parse the commandName as a first identifier Variant commandName = parseIdentifier(start, true); @@ -416,7 +416,7 @@ OsdmStreamParser::State OsdmStreamParser::parseCommand(size_t start) return State::COMMAND; } -void 
OsdmStreamParser::parseBlockComment() +void OsmlStreamParser::parseBlockComment() { Token token; size_t depth = 1; @@ -436,7 +436,7 @@ void OsdmStreamParser::parseBlockComment() logger.error("File ended while being in a block comment", reader); } -void OsdmStreamParser::parseLineComment() +void OsmlStreamParser::parseLineComment() { char c; while (reader.read(c)) { @@ -446,7 +446,7 @@ void OsdmStreamParser::parseLineComment() } } -bool OsdmStreamParser::checkIssueData(DataHandler &handler) +bool OsmlStreamParser::checkIssueData(DataHandler &handler) { if (!handler.isEmpty()) { data = handler.toVariant(reader.getSourceId()); @@ -457,7 +457,7 @@ bool OsdmStreamParser::checkIssueData(DataHandler &handler) return false; } -bool OsdmStreamParser::checkIssueFieldStart() +bool OsmlStreamParser::checkIssueFieldStart() { // Fetch the current command, and check whether we're currently inside a // field of this command @@ -482,7 +482,7 @@ bool OsdmStreamParser::checkIssueFieldStart() return false; } -OsdmStreamParser::State OsdmStreamParser::parse() +OsmlStreamParser::State OsmlStreamParser::parse() { // Handler for incomming data DataHandler handler; @@ -627,12 +627,12 @@ OsdmStreamParser::State OsdmStreamParser::parse() return State::END; } -const Variant &OsdmStreamParser::getCommandName() +const Variant &OsmlStreamParser::getCommandName() { return commands.top().name; } -const Variant &OsdmStreamParser::getCommandArguments() +const Variant &OsmlStreamParser::getCommandArguments() { return commands.top().arguments; } diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp index 84674c0..1508012 100644 --- a/src/formats/osml/OsmlStreamParser.hpp +++ b/src/formats/osml/OsmlStreamParser.hpp @@ -17,17 +17,17 @@ */ /** - * @file OsdmStreamParser.hpp + * @file OsmlStreamParser.hpp * - * Provides classes for low-level classes for reading the TeX-esque osdm + * Provides classes for low-level classes for reading the TeX-esque osml * format. 
The class provided here does not build any model objects and does not * implement the Parser interface. * * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) */ -#ifndef _OUSIA_OSDM_STREAM_PARSER_HPP_ -#define _OUSIA_OSDM_STREAM_PARSER_HPP_ +#ifndef _OUSIA_OSML_STREAM_PARSER_HPP_ +#define _OUSIA_OSML_STREAM_PARSER_HPP_ #include @@ -42,7 +42,7 @@ class Logger; class DataHandler; /** - * The OsdmStreamParser class provides a low-level reader for the TeX-esque osdm + * The OsmlStreamParser class provides a low-level reader for the TeX-esque osml * format. The parser is constructed around a "parse" function, which reads data * from the underlying CharReader until a new state is reached and indicates * this state in a return value. The calling code then has to pull corresponding @@ -52,10 +52,10 @@ class DataHandler; * fields, as this would lead to too many consecutive errors) a * LoggableException is thrown. */ -class OsdmStreamParser { +class OsmlStreamParser { public: /** - * Enum used to indicate which state the OsdmStreamParser class is in + * Enum used to indicate which state the OsmlStreamParser class is in * after calling the "parse" function. */ enum class State { @@ -291,14 +291,14 @@ private: public: /** - * Constructor of the OsdmStreamParser class. Attaches the new - * OsdmStreamParser to the given CharReader and Logger instances. + * Constructor of the OsmlStreamParser class. Attaches the new + * OsmlStreamParser to the given CharReader and Logger instances. * * @param reader is the reader instance from which incomming characters * should be read. * @param logger is the logger instance to which errors should be written. */ - OsdmStreamParser(CharReader &reader, Logger &logger); + OsmlStreamParser(CharReader &reader, Logger &logger); /** * Continues parsing. Returns one of the states defined in the State enum. 
@@ -346,5 +346,5 @@ public: }; } -#endif /* _OUSIA_OSDM_STREAM_PARSER_HPP_ */ +#endif /* _OUSIA_OSML_STREAM_PARSER_HPP_ */ diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp new file mode 100644 index 0000000..2ef170e --- /dev/null +++ b/src/formats/osxml/OsxmlEventParser.cpp @@ -0,0 +1,524 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include +#include +#include + +#include "OsxmlEventParser.hpp" + +namespace ousia { + +/** + * Class containing data used by the internal functions. + */ +class OsxmlEventParserData { +public: + /** + * Contains the current depth of the parsing process. + */ + ssize_t depth; + + /** + * Set to a value larger or equal to zero if the parser is currently inside + * an annotation end tag -- the value represents the depth in which the + * tag was opened. + */ + ssize_t annotationEndTagDepth; + + /** + * Default constructor. + */ + OsxmlEventParserData() : depth(0), annotationEndTagDepth(-1) {} + + /** + * Increments the depth. + */ + void incrDepth() { depth++; } + + /** + * Decrement the depth and reset the annotationEndTagDepth flag. + */ + void decrDepth() + { + if (depth > 0) { + depth--; + } + if (depth < annotationEndTagDepth) { + annotationEndTagDepth = -1; + } + } + + /** + * Returns true if we're currently inside an end tag. 
+ */ + bool inAnnotationEndTag() { depth >= annotationEndTagDepth; } +}; + +namespace { +/** + * Wrapper class around the XML_Parser pointer which safely frees it whenever + * the scope is left (e.g. because an exception was thrown). + */ +class ScopedExpatXmlParser { +private: + /** + * Internal pointer to the XML_Parser instance. + */ + XML_Parser parser; + +public: + /** + * Constructor of the ScopedExpatXmlParser class. Calls XML_ParserCreateNS + * from the expat library. Throws a parser exception if the XML parser + * cannot be initialized. + * + * @param encoding is the protocol-defined encoding passed to expat (or + * nullptr if expat should determine the encoding by itself). + */ + ScopedExpatXmlParser(const XML_Char *encoding) : parser(nullptr) + { + parser = XML_ParserCreate(encoding); + if (!parser) { + throw LoggableException{ + "Internal error: Could not create expat XML parser!"}; + } + } + + /** + * Destuctor of the ScopedExpatXmlParser, frees the XML parser instance. + */ + ~ScopedExpatXmlParser() + { + if (parser) { + XML_ParserFree(parser); + parser = nullptr; + } + } + + /** + * Returns the XML_Parser pointer. + */ + XML_Parser operator&() { return parser; } +}; + +/** + * Enum used internally in the statemachine of the micro-xml argument parser. + */ +enum class XmlAttributeState { + IN_TAG_NAME, + SEARCH_ATTR, + IN_ATTR_NAME, + HAS_ATTR_NAME, + HAS_ATTR_EQUALS, + IN_ATTR_DATA +}; + +/** + * Function used to reconstruct the location of the attributes of a XML tag in + * the source code. This is necessary, as the xml parser only returns an offset + * to the begining of a tag and not to the position of the individual arguments. + * + * @param reader is the char reader from which the character data should be + * read. + * @param offs is a byte offset in the xml file pointing at the "<" character of + * the tag. + * @return a map from attribute keys to the corresponding location (including + * range) of the atribute. 
Also contains the location of the tagname in the + * form of the virtual attribute "$tag". + */ +static std::map xmlReconstructAttributeOffsets( + CharReader &reader, size_t offs) +{ + std::map res; + + // Fork the reader, we don't want to mess up the XML parsing process, do we? + CharReaderFork readerFork = reader.fork(); + + // Move the read cursor to the start location, abort if this does not work + if (!location.isValid() || offs != readerFork.seek(offs)) { + return res; + } + + // Now all we need to do is to implement one half of an XML parser. As this + // is inherently complicated we'll totaly fail at it. Don't care. All we + // want to get is those darn offsets for pretty error messages... (and we + // can assume the XML is valid as it was already read by expat) + XmlAttributeState state = XmlAttributeState::IN_TAG_NAME; + char c; + std::stringstream attrName; + while (readerFork.read(c)) { + // Abort at the end of the tag + if (c == '>' && state != XmlAttributeState::IN_ATTR_DATA) { + return res; + } + + // One state machine to rule them all, one state machine to find them, + // One state machine to bring them all and in the darkness bind them + // (the byte offsets) + switch (state) { + case XmlAttributeState::IN_TAG_NAME: + if (Utils::isWhitespace(c)) { + res.emplace("$tag", + SourceLocation{reader.getSourceId(), offs + 1, + readerFork.getOffset() - 1}); + state = XmlAttributeState::SEARCH_ATTR; + } + break; + case XmlAttributeState::SEARCH_ATTR: + if (!Utils::isWhitespace(c)) { + state = XmlAttributeState::IN_ATTR_NAME; + attrName << c; + } + break; + case XmlAttributeState::IN_ATTR_NAME: + if (Utils::isWhitespace(c)) { + state = XmlAttributeState::HAS_ATTR_NAME; + } else if (c == '=') { + state = XmlAttributeState::HAS_ATTR_EQUALS; + } else { + attrName << c; + } + break; + case XmlAttributeState::HAS_ATTR_NAME: + if (!Utils::isWhitespace(c)) { + if (c == '=') { + state = XmlAttributeState::HAS_ATTR_EQUALS; + break; + } + // Well, this is a strange XML 
file... We expected to + // see a '=' here! Try to continue with the + // "HAS_ATTR_EQUALS" state as this state will hopefully + // inlcude some error recovery + } else { + // Skip whitespace here + break; + } + // Fallthrough + case XmlAttributeState::HAS_ATTR_EQUALS: + if (!Utils::isWhitespace(c)) { + if (c == '"') { + // Here we are! We have found the beginning of an + // attribute. Let's quickly lock the current offset away + // in the result map + res.emplace(attrName.str(), + SourceLocation{reader.getSourceId(), + readerFork.getOffset()}); + state = XmlAttributeState::IN_ATTR_DATA; + } else { + // No, this XML file is not well formed. Assume we're in + // an attribute name once again + attrName.str(std::string{&c, 1}); + state = XmlAttributeState::IN_ATTR_NAME; + } + } + break; + case XmlAttributeState::IN_ATTR_DATA: + if (c == '"') { + // We're at the end of the attribute data, set the end + // location + auto it = res.find(attrName.str()); + if (it != res.end()) { + it->second.setEnd(readerFork.getOffset() - 1); + } + + // Reset the attribute name and restart the search + attrName.str(std::string{}); + state = XmlAttributeState::SEARCH_ATTR; + } + break; + } + } + return res; +} + +/** + * Synchronizes the position of the xml parser with the default location of the + * logger instance. + * + * @param p is a pointer at the xml parser instance. + * @param len is the length of the string that should be refered to. + * @return the SourceLocation that has been set in the logger. 
+ */ +static SourceLocation xmlSyncLoggerPosition(XML_Parser p, size_t len = 0) +{ + // Fetch the OsxmlEventParser instance + OsxmlEventParser *parser = + static_cast(XML_GetUserData(p)); + + // Fetch the current location in the XML file and set the default location + // in the logger + size_t offs = XML_GetCurrentByteIndex(p); + SourceLocation loc = + SourceLocation{parser->getReader().getSourceId(), offs, offs + len}; + parser->getLogger().setDefaultLocation(location); + + // Return the fetched location + return loc; +} + +/** + * Prefix used to indicate the start of an annoation, + */ +static const std::string ANNOTATION_START_PREFIX{"a:start:"}; + +/** + * Prefix used to indicate the end of an annotation. + */ +static const std::string ANNOTATION_END_PREFIX{"a:end"}; + +/** + * Callback called by eXpat whenever a start handler is reached. + */ +static void xmlStartElementHandler(void *ref, const XML_Char *name, + const XML_Char **attrs) +{ + // Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser + XML_Parser p = static_cast(ref); + OsxmlEventParser *parser = static_cast(XML_GetUserData(p)); + + // Read the argument locations -- this is only a stupid and slow hack, + // but it is necessary, as expat doesn't give use the byte offset of the + // arguments. + std::map attributeOffsets = + xmlReconstructXMLAttributeOffsets(*userData->reader, + XML_GetCurrentByteIndex(p)); + + // Update the logger position + SourceLocation loc = xmlSyncLoggerPosition(p); + + // Fetch the location of the name + SourceLocation nameLoc = loc; + auto it = attributeOffsets.find("$tag"); + if (it != attributeOffsets.end()) { + nameLoc = it->second; + } + // Increment the current depth + parser->getData().incrDepth(); + + // Make sure we're currently not inside an annotation end tag -- this would + // be highly illegal! 
+ if (parser->getData().inAnnotationEndTag()) { + logger.error("No tags allowed inside an annotation end tag", nameLoc); + return; + } + + // Assemble the arguments + Variant::mapType args; + const XML_Char **attr = attrs; + while (*attr) { + // Convert the C string to a std::string + const std::string key{*(attr++)}; + + // Search the location of the key + SourceLocation keyLoc; + auto it = attributeOffsets.find(key); + if (it != attributeOffsets.end()) { + keyLoc = it->second; + } + + // Parse the string, pass the location of the key + std::pair value = VariantReader::parseGenericString( + *(attr++), stack->getContext().getLogger(), keyLoc.getSourceId(), + keyLoc.getStart()); + + // Set the overall location of the parsed element to the attribute + // location + value.second->setLocation(keyLoc); + + // Store the + if (!args.emplace(key, value.second).second) { + parser->getLogger().warning( + std::string("Attribute \"") + key + + "\" defined multiple times, only using first definition", + keyLoc); + } + } + + // Fetch the name of the tag, check for special tags + std::string nameStr(name); + if (nameStr == "ousia" && parser->getData().depth == 1) { + // We're in the top-level and the magic "ousia" tag is reached -- just + // ignore it and issue a warning for each argument that has been given + for (const auto &arg : args) { + parser->getLogger().warning( + std::string("Ignoring attribute \"") + arg.first + + std::string("\" for magic tag \"ousia\""), + arg.second); + } + } else if (Utils::startsWith(nameStr, ANNOTATION_START_PREFIX)) { + // Assemble a name variant containing the name minus the prefix + Variant nameVar = nameStr.substr(ANNOTATION_START_PREFIX.size()); + nameVar.setLocation(nameLoc); + + // Issue the "annotationStart" event + parser->getEvents().annotationStart(nameVar, args); + } else if (Utils::startsWith(nameStr, ANNOTATION_END_PREFIX)) { + // Assemble a name variant containing the name minus the prefix + nameStr = 
nameStr.substr(ANNOTATION_END_PREFIX.size()); + + // Discard a potentially leading colon + if (!nameStr.empty() && nameStr[0] == ':') { + nameStr = nameStr.substr(1); + } + + // Assemble the variant containing the name and its location + Variant nameVar = Variant::fromString(nameStr); + nameVar.setLocation(nameLoc); + + // Check whether a "name" attribute was given + Variant elementName; + for (const auto &arg : args) { + if (arg.first == "name") { + elementName = arg.second; + } else { + parser->getLogger().warning( + std::string("Ignoring attribute \"") + arg.first + + "\" in annotation end tag", + arg.second); + } + } + + // Set the annotationEndTagDepth to disallow any further tags to be + // opened inside the annotation end tag. + parser->getData().annotationEndTagDepth = parser->getData().depth; + + // Issue the "annotationEnd" event + parser->getEvents().annotationEnd(nameVar, args); + } else { + // Just issue a "commandStart" event in any other case + Variant nameVar = Variant::fromString(nameStr); + nameVar.setLocation(nameLoc); + parser->getEvents().commandStart(nameVar, args); + } +} + +static void xmlEndElementHandler(void *p, const XML_Char *name) +{ + // Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser + XML_Parser p = static_cast(ref); + OsxmlEventParser *parser = static_cast(XML_GetUserData(p)); + + // Synchronize the position of the logger with teh position + xmlSyncLoggerPosition(parser); + + // Decrement the current depth + parser->getData().decrDepth(); + + // Abort as long as we're in an annotation end tag + if (parser->getData().inAnnotationEndTag()) { + return; + } + + // Abort if the special ousia tag ends here + if (nameStr == "ousia" && parser->getData().depth == 0) { + return; + } + + // Issue the "fieldEnd" event + parser->getEvents().fieldEnd(); +} + +static void xmlCharacterDataHandler(void *p, const XML_Char *s, int len) +{ + // Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser + XML_Parser p = 
static_cast(ref); + OsxmlEventParser *parser = static_cast(XML_GetUserData(p)); + + // TODO +/* size_t ulen = len > 0 ? static_cast(len) : 0; + syncLoggerPosition(parser, ulen); + const std::string data = Utils::trim(std::string{s, ulen}); + if (!data.empty()) { + stack->data(data); + }*/ +} +} + +/* Class OsxmlEventParser */ + +OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events, + Logger &logger) + : reader(reader), + events(events), + logger(logger), + whitespaceMode(WhitespaceMode::COLLAPSE), + data(new OsxmlEventParserData()) +{ +} + +void OsxmlEventParser::parse(CharReader &reader) +{ + // Create the parser object + ScopedExpatXmlParser p{"UTF-8"}; + + // Reset the depth + depth = 0; + + // Pass the reference to the ParserStack to the XML handler + XMLUserData data(&stack, &reader); + XML_SetUserData(&p, this); + XML_UseParserAsHandlerArg(&p); + + // Set the callback functions + XML_SetStartElementHandler(&p, xmlStartElementHandler); + XML_SetEndElementHandler(&p, xmlEndElementHandler); + XML_SetCharacterDataHandler(&p, xmlCharacterDataHandler); + + // Feed data into expat while there is data to process + constexpr size_t BUFFER_SIZE = 64 * 1024; + while (true) { + // Fetch a buffer from expat for the input data + char *buf = static_cast(XML_GetBuffer(&p, BUFFER_SIZE)); + if (!buf) { + throw OusiaException{"Internal error: XML parser out of memory!"}; + } + + // Read into the buffer + size_t bytesRead = reader.readRaw(buf, BUFFER_SIZE); + + // Parse the data and handle any XML error as exception + if (!XML_ParseBuffer(&p, bytesRead, bytesRead == 0)) { + throw LoggableException{ + "XML: " + std::string{XML_ErrorString(XML_GetErrorCode(&p))}, + xmlSyncLoggerPosition(p)}; + } + + // Abort once there are no more bytes in the stream + if (bytesRead == 0) { + break; + } + } +} + +void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode) +{ + this->whitespaceMode = whitespaceMode; +} + +CharReader 
&OsxmlEventParser::getCharReader() { return charReader; } + +Logger &OsxmlEventParser::getLogger() { return logger; } + +OsxmlEvents &OsxmlEventParser::getEvents() { return events; } + +OsxmlEventParserData &OsxmlEventParser::getData() { return *data; } +} + diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp new file mode 100644 index 0000000..5319ca6 --- /dev/null +++ b/src/formats/osxml/OsxmlEventParser.hpp @@ -0,0 +1,205 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file OsxmlEventParser.hpp + * + * The OsxmlEventParser class is responsible for parsing an XML file and calling + * the corresponding event handler functions if an XML item is found. Event + * handling is performed using a listener interface. + * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OSXML_EVENT_PARSER_HPP_ +#define _OSXML_EVENT_PARSER_HPP_ + +#include +#include + +#include + +namespace ousia { + +// Forward declarations +class Logger; +class Variant; +class OsxmlEventParserData; + +/** + * Interface which defines the callback functions which are called by the + * OsxmlEventParser whenever an event occurs. + */ +class OsxmlEvents { +public: + /** + * Virtual destructor. + */ + virtual ~OsxmlEvents() {} + + /** + * Called whenever a command starts. 
Note that this implicitly always starts + * the default field of the command. + * + * @param name is a string variant containing name and location of the + * command. + * @param args is a map variant containing the arguments that were given + * to the command. + */ + virtual void commandStart(Variant name, Variant args) = 0; + + /** + * Called whenever an annotation starts. Note that this implicitly always + * starts the default field of the annotation. + * + * @param name is a string variant containing the name of the annotation + * class and the location of the annotation definition. + * @param args is a map variant containing the arguments that were given + * to the annotation definition. + */ + virtual void annotationStart(Variant name, Variant args); + + /** + * Called whenever the range of an annotation ends. The callee must + * disambiguate the actual annotation that is finished here. + * + * @param name is a string variant containing the name of the annotation + * class that should end here. May be empty (or nullptr), if no elementName + * has been specified at the end of the annotation. + * @param elementName is the name of the annotation element that should be + * ended here. May be empty (or nullptr), if no elementName has been + * specified at the end of the annotation. + */ + virtual void annotationEnd(Variant name, Variant elementName); + + /** + * Called whenever the default field which was implicitly started by + * commandStart or annotationStart ends. Note that this does not end the + * range of an annotation, but the default field of the annotation. To + * signal the end of the annotation this, the annotationEnd method will be + * invoked. + */ + virtual void fieldEnd() = 0; + + /** + * Called whenever data is found. Whitespace data is handled as specified + * and the data has been parsed to the specified variant type. This function + * is not called if the parsing failed, the parser prints an error message + * instead. 
+ * + * @param data is the already parsed data that should be passed to the + * handler. + */ + virtual void data(Variant data) = 0; + +}; + +/** + * The OsxmlEventParser class is a wrapper around eXpat which implements the + * specialities of the osxml formats class (like annotation ranges). It notifies + * a specified event handler whenever a command, annotation or data has been + * reached. + */ +class OsxmlEventParser { +private: + /** + * Reference at the internal CharReader instance. + */ + CharReader &reader; + + /** + * Set of callback functions to be called whenever an event is triggered. + */ + OsxmlEvents &events; + + /** + * Reference at the Logger object to which error messages or warnings should + * be logged. + */ + Logger &logger; + + /** + * Current whitespace mode. + */ + WhitespaceMode whitespaceMode; + + /** + * Data to be used by the internal functions. + */ + std::unique_ptr data; + +public: + /** + * Constructor fo the OsxmlEventParser. Takes a reference at the OsxmlEvents + * of which the callback functions are called. + * + * @param reader is a reference to the CharReader instance from which the + * XML should be read. + * @param events is a refence at an instance of the OsxmlEvents class. All + * events are forwarded to this class. + * @param logger is the Logger instance to which log messages should be + * written. + */ + OsxmlEventParser(CharReader &reader, OsxmlEvents &events, Logger &logger); + + /** + * Performs the actual parsing. Reads the XML using eXpat and calles the + * callbacks in the event listener instance whenever something interesting + * happens. + */ + void parse(); + + /** + * Sets the whitespace handling mode. + * + * @param whitespaceMode defines how whitespace in the data should be + * handled. + */ + void setWhitespaceMode(WhitespaceMode whitespaceMode); + + /** + * Returns the internal CharReader reference. + * + * @return the CharReader reference. 
+ */ + CharReader &getCharReader(); + + /** + * Returns the internal Logger reference. + * + * @return the internal Logger reference. + */ + Logger &getLogger(); + + /** + * Returns the internal OsxmlEvents reference. + * + * @return the internal OsxmlEvents reference. + */ + OsxmlEvents &getEvents(); + + /** + * Returns a reference at the internal data. + */ + OsxmlEventParserData &getData(); +}; + +} + +#endif /* _OSXML_EVENT_PARSER_HPP_ */ + diff --git a/src/formats/osxml/OsxmlParser.cpp b/src/formats/osxml/OsxmlParser.cpp index c46d9de..4f6503c 100644 --- a/src/formats/osxml/OsxmlParser.cpp +++ b/src/formats/osxml/OsxmlParser.cpp @@ -1093,343 +1093,6 @@ static const std::multimap XmlStates{ {"include", &Include}}; } -/** - * Structue containing the private data that is being passed to the - * XML-Handlers. - */ -struct XMLUserData { - /** - * Containing the depth of the current XML file - */ - size_t depth; - - /** - * Reference at the ParserStack instance. - */ - ParserStack *stack; - - /** - * Reference at the CharReader instance. - */ - CharReader *reader; - - /** - * Constructor of the XMLUserData struct. - * - * @param stack is a pointer at the ParserStack instance. - * @param reader is a pointer at the CharReader instance. - */ - XMLUserData(ParserStack *stack, CharReader *reader) - : depth(0), stack(stack), reader(reader) - { - } -}; - -/** - * Wrapper class around the XML_Parser pointer which safely frees it whenever - * the scope is left (e.g. because an exception was thrown). - */ -class ScopedExpatXmlParser { -private: - /** - * Internal pointer to the XML_Parser instance. - */ - XML_Parser parser; - -public: - /** - * Constructor of the ScopedExpatXmlParser class. Calls XML_ParserCreateNS - * from the expat library. Throws a parser exception if the XML parser - * cannot be initialized. - * - * @param encoding is the protocol-defined encoding passed to expat (or - * nullptr if expat should determine the encoding by itself). 
- */ - ScopedExpatXmlParser(const XML_Char *encoding) : parser(nullptr) - { - parser = XML_ParserCreate(encoding); - if (!parser) { - throw LoggableException{ - "Internal error: Could not create expat XML parser!"}; - } - } - - /** - * Destuctor of the ScopedExpatXmlParser, frees the XML parser instance. - */ - ~ScopedExpatXmlParser() - { - if (parser) { - XML_ParserFree(parser); - parser = nullptr; - } - } - - /** - * Returns the XML_Parser pointer. - */ - XML_Parser operator&() { return parser; } -}; - -/* Adapter Expat -> ParserStack */ - -static SourceLocation syncLoggerPosition(XML_Parser p, size_t len = 0) -{ - // Fetch the parser stack and the associated user data - XMLUserData *userData = static_cast(XML_GetUserData(p)); - ParserStack *stack = userData->stack; - - // Fetch the current location in the XML file - size_t offs = XML_GetCurrentByteIndex(p); - - // Build the source location and update the default location of the - // current - // logger instance - SourceLocation loc{stack->getContext().getSourceId(), offs, offs + len}; - stack->getContext().getLogger().setDefaultLocation(loc); - return loc; -} - -enum class XMLAttributeState { - IN_TAG_NAME, - SEARCH_ATTR, - IN_ATTR_NAME, - HAS_ATTR_NAME, - HAS_ATTR_EQUALS, - IN_ATTR_DATA -}; - -static std::map reconstructXMLAttributeOffsets( - CharReader &reader, SourceLocation location) -{ - std::map res; - - // Fork the reader, we don't want to mess up the XML parsing process, do we? - CharReaderFork readerFork = reader.fork(); - - // Move the read cursor to the start location, abort if this does not work - size_t offs = location.getStart(); - if (!location.isValid() || offs != readerFork.seek(offs)) { - return res; - } - - // Now all we need to do is to implement one half of an XML parser. As this - // is inherently complicated we'll totaly fail at it. Don't care. All we - // want to get is those darn offsets for pretty error messages... 
(and we - // can assume the XML is valid as it was already read by expat) - XMLAttributeState state = XMLAttributeState::IN_TAG_NAME; - char c; - std::stringstream attrName; - while (readerFork.read(c)) { - // Abort at the end of the tag - if (c == '>' && state != XMLAttributeState::IN_ATTR_DATA) { - return res; - } - - // One state machine to rule them all, one state machine to find them, - // One state machine to bring them all and in the darkness bind them - // (the byte offsets) - switch (state) { - case XMLAttributeState::IN_TAG_NAME: - if (Utils::isWhitespace(c)) { - state = XMLAttributeState::SEARCH_ATTR; - } - break; - case XMLAttributeState::SEARCH_ATTR: - if (!Utils::isWhitespace(c)) { - state = XMLAttributeState::IN_ATTR_NAME; - attrName << c; - } - break; - case XMLAttributeState::IN_ATTR_NAME: - if (Utils::isWhitespace(c)) { - state = XMLAttributeState::HAS_ATTR_NAME; - } else if (c == '=') { - state = XMLAttributeState::HAS_ATTR_EQUALS; - } else { - attrName << c; - } - break; - case XMLAttributeState::HAS_ATTR_NAME: - if (!Utils::isWhitespace(c)) { - if (c == '=') { - state = XMLAttributeState::HAS_ATTR_EQUALS; - break; - } - // Well, this is a strange XML file... We expected to - // see a '=' here! Try to continue with the - // "HAS_ATTR_EQUALS" state as this state will hopefully - // inlcude some error recovery - } else { - // Skip whitespace here - break; - } - // Fallthrough - case XMLAttributeState::HAS_ATTR_EQUALS: - if (!Utils::isWhitespace(c)) { - if (c == '"') { - // Here we are! We have found the beginning of an - // attribute. Let's quickly lock the current offset away - // in the result map - res.emplace(attrName.str(), - SourceLocation{reader.getSourceId(), - readerFork.getOffset()}); - attrName.str(std::string{}); - state = XMLAttributeState::IN_ATTR_DATA; - } else { - // No, this XML file is not well formed. 
Assume we're in - // an attribute name once again - attrName.str(std::string{&c, 1}); - state = XMLAttributeState::IN_ATTR_NAME; - } - } - break; - case XMLAttributeState::IN_ATTR_DATA: - if (c == '"') { - // We're at the end of the attribute data, start anew - state = XMLAttributeState::SEARCH_ATTR; - } - break; - } - } - return res; -} - -static void xmlStartElementHandler(void *p, const XML_Char *name, - const XML_Char **attrs) -{ - XML_Parser parser = static_cast(p); - XMLUserData *userData = static_cast(XML_GetUserData(p)); - ParserStack *stack = userData->stack; - - SourceLocation loc = syncLoggerPosition(parser); - - // Read the argument locations -- this is only a stupid and slow hack, - // but it is necessary, as expat doesn't give use the byte offset of the - // arguments. - std::map offs = - reconstructXMLAttributeOffsets(*userData->reader, loc); - - // Assemble the arguments - Variant::mapType args; - - const XML_Char **attr = attrs; - while (*attr) { - // Convert the C string to a std::string - const std::string key{*(attr++)}; - - // Search the location of the key - SourceLocation keyLoc; - auto it = offs.find(key); - if (it != offs.end()) { - keyLoc = it->second; - } - - // Parse the string, pass the location of the key - std::pair value = VariantReader::parseGenericString( - *(attr++), stack->getContext().getLogger(), keyLoc.getSourceId(), - keyLoc.getStart()); - args.emplace(key, value.second); - } - - // Call the start function - std::string nameStr(name); - if (nameStr != "ousia" || userData->depth > 0) { - stack->start(std::string(name), args, loc); - } - - // Increment the current depth - userData->depth++; -} - -static void xmlEndElementHandler(void *p, const XML_Char *name) -{ - XML_Parser parser = static_cast(p); - XMLUserData *userData = static_cast(XML_GetUserData(p)); - ParserStack *stack = userData->stack; - - syncLoggerPosition(parser); - - // Decrement the current depth - userData->depth--; - - // Call the end function - std::string 
nameStr(name); - if (nameStr != "ousia" || userData->depth > 0) { - stack->end(); - } -} -static void xmlCharacterDataHandler(void *p, const XML_Char *s, int len) -{ - XML_Parser parser = static_cast(p); - XMLUserData *userData = static_cast(XML_GetUserData(p)); - ParserStack *stack = userData->stack; - - size_t ulen = len > 0 ? static_cast(len) : 0; - syncLoggerPosition(parser, ulen); - const std::string data = Utils::trim(std::string{s, ulen}); - if (!data.empty()) { - stack->data(data); - } -} - -/* Class XmlParser */ - -void XmlParser::doParse(CharReader &reader, ParserContext &ctx) -{ - // Create the parser object - ScopedExpatXmlParser p{"UTF-8"}; - - // Create the parser stack instance, if we're starting on a non-empty scope, - // try to deduce the parser state - ParserStack stack(ctx, ParserStates::XmlStates); - if (!ctx.getScope().isEmpty()) { - if (!stack.deduceState()) { - return; - } - } - - // Pass the reference to the ParserStack to the XML handler - XMLUserData data(&stack, &reader); - XML_SetUserData(&p, &data); - XML_UseParserAsHandlerArg(&p); - - // Set the callback functions - XML_SetStartElementHandler(&p, xmlStartElementHandler); - XML_SetEndElementHandler(&p, xmlEndElementHandler); - XML_SetCharacterDataHandler(&p, xmlCharacterDataHandler); - - // Feed data into expat while there is data to process - constexpr size_t BUFFER_SIZE = 64 * 1024; - while (true) { - // Fetch a buffer from expat for the input data - char *buf = static_cast(XML_GetBuffer(&p, BUFFER_SIZE)); - if (!buf) { - throw LoggableException{ - "Internal error: XML parser out of memory!"}; - } - - // Read into the buffer - size_t bytesRead = reader.readRaw(buf, BUFFER_SIZE); - - // Parse the data and handle any XML error - if (!XML_ParseBuffer(&p, bytesRead, bytesRead == 0)) { - // Fetch the xml parser byte offset - size_t offs = XML_GetCurrentByteIndex(&p); - - // Throw a corresponding exception - XML_Error code = XML_GetErrorCode(&p); - std::string msg = 
std::string{XML_ErrorString(code)}; - throw LoggableException{"XML: " + msg, - SourceLocation{ctx.getSourceId(), offs}}; - } - - // Abort once there are no more bytes in the stream - if (bytesRead == 0) { - break; - } - } -} } diff --git a/test/formats/osdm/OsdmStreamParserTest.cpp b/test/formats/osdm/OsdmStreamParserTest.cpp deleted file mode 100644 index 46f4cf6..0000000 --- a/test/formats/osdm/OsdmStreamParserTest.cpp +++ /dev/null @@ -1,973 +0,0 @@ -/* - Ousía - Copyright (C) 2014 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include - -#include - -#include -#include - -#include - -namespace ousia { - -static TerminalLogger logger(std::cerr, true); - -TEST(OsdmStreamParser, empty) -{ - const char *testString = ""; - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - ASSERT_EQ(OsdmStreamParser::State::END, reader.parse()); -} - -TEST(OsdmStreamParser, oneCharacter) -{ - const char *testString = "a"; - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - ASSERT_EQ(OsdmStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("a", reader.getData().asString()); - - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(1U, loc.getEnd()); -} - -TEST(OsdmStreamParser, whitespaceElimination) -{ - const char *testString = " hello \t world "; - // 0123456 78901234 - // 0 1 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - ASSERT_EQ(OsdmStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("hello world", reader.getData().asString()); - - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(14U, loc.getEnd()); -} - -TEST(OsdmStreamParser, whitespaceEliminationWithLinebreak) -{ - const char *testString = " hello \n world "; - // 0123456 78901234 - // 0 1 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - ASSERT_EQ(OsdmStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("hello world", reader.getData().asString()); - - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(14U, loc.getEnd()); - ASSERT_EQ(OsdmStreamParser::State::END, reader.parse()); -} - -TEST(OsdmStreamParser, escapeWhitespace) -{ - const char *testString = " hello\\ \\ world "; - // 012345 67 89012345 - // 0 1 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - ASSERT_EQ(OsdmStreamParser::State::DATA, 
reader.parse()); - ASSERT_EQ("hello world", reader.getData().asString()); - - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(15U, loc.getEnd()); - ASSERT_EQ(OsdmStreamParser::State::END, reader.parse()); -} - -static void testEscapeSpecialCharacter(const std::string &c) -{ - CharReader charReader(std::string("\\") + c); - OsdmStreamParser reader(charReader, logger); - EXPECT_EQ(OsdmStreamParser::State::DATA, reader.parse()); - EXPECT_EQ(c, reader.getData().asString()); - - SourceLocation loc = reader.getData().getLocation(); - EXPECT_EQ(0U, loc.getStart()); - EXPECT_EQ(1U + c.size(), loc.getEnd()); -} - -TEST(OsdmStreamParser, escapeSpecialCharacters) -{ - testEscapeSpecialCharacter("\\"); - testEscapeSpecialCharacter("{"); - testEscapeSpecialCharacter("}"); - testEscapeSpecialCharacter("<"); - testEscapeSpecialCharacter(">"); -} - -TEST(OsdmStreamParser, simpleSingleLineComment) -{ - const char *testString = "% This is a single line comment"; - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - ASSERT_EQ(OsdmStreamParser::State::END, reader.parse()); -} - -TEST(OsdmStreamParser, singleLineComment) -{ - const char *testString = "a% This is a single line comment\nb"; - // 01234567890123456789012345678901 23 - // 0 1 2 3 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - { - ASSERT_EQ(OsdmStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("a", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(1U, loc.getEnd()); - } - - { - ASSERT_EQ(OsdmStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("b", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(33U, loc.getStart()); - ASSERT_EQ(34U, loc.getEnd()); - } - - ASSERT_EQ(OsdmStreamParser::State::END, reader.parse()); -} - -TEST(OsdmStreamParser, multilineComment) -{ - const 
char *testString = "a%{ This is a\n\n multiline line comment}%b"; - // 0123456789012 3 456789012345678901234567890 - // 0 1 2 3 4 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - { - ASSERT_EQ(OsdmStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("a", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(1U, loc.getEnd()); - } - - { - ASSERT_EQ(OsdmStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("b", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(40U, loc.getStart()); - ASSERT_EQ(41U, loc.getEnd()); - } - - ASSERT_EQ(OsdmStreamParser::State::END, reader.parse()); -} - -TEST(OsdmStreamParser, nestedMultilineComment) -{ - const char *testString = "a%{%{Another\n\n}%multiline line comment}%b"; - // 0123456789012 3 456789012345678901234567890 - // 0 1 2 3 4 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - { - ASSERT_EQ(OsdmStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("a", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(1U, loc.getEnd()); - } - - { - ASSERT_EQ(OsdmStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("b", reader.getData().asString()); - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(40U, loc.getStart()); - ASSERT_EQ(41U, loc.getEnd()); - } - - ASSERT_EQ(OsdmStreamParser::State::END, reader.parse()); -} - -TEST(OsdmStreamParser, simpleCommand) -{ - const char *testString = "\\test"; - // 0 12345 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - ASSERT_EQ(OsdmStreamParser::State::COMMAND, reader.parse()); - - Variant commandName = reader.getCommandName(); - ASSERT_EQ("test", commandName.asString()); - - SourceLocation loc = commandName.getLocation(); - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, 
loc.getEnd()); - - ASSERT_EQ(0U, reader.getCommandArguments().asMap().size()); - ASSERT_EQ(OsdmStreamParser::State::END, reader.parse()); -} - -TEST(OsdmStreamParser, simpleCommandWithName) -{ - const char *testString = "\\test#bla"; - // 0 12345678 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - ASSERT_EQ(OsdmStreamParser::State::COMMAND, reader.parse()); - - Variant commandName = reader.getCommandName(); - ASSERT_EQ("test", commandName.asString()); - SourceLocation loc = commandName.getLocation(); - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); - - Variant commandArguments = reader.getCommandArguments(); - ASSERT_TRUE(commandArguments.isMap()); - ASSERT_EQ(1U, commandArguments.asMap().size()); - ASSERT_EQ(1U, commandArguments.asMap().count("name")); - ASSERT_EQ("bla", commandArguments.asMap()["name"].asString()); - - loc = commandArguments.asMap()["name"].getLocation(); - ASSERT_EQ(5U, loc.getStart()); - ASSERT_EQ(9U, loc.getEnd()); - - ASSERT_EQ(OsdmStreamParser::State::END, reader.parse()); -} - -TEST(OsdmStreamParser, simpleCommandWithArguments) -{ - const char *testString = "\\test[a=1,b=2,c=\"test\"]"; - // 0 123456789012345 678901 2 - // 0 1 2 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - ASSERT_EQ(OsdmStreamParser::State::COMMAND, reader.parse()); - - Variant commandName = reader.getCommandName(); - ASSERT_EQ("test", commandName.asString()); - SourceLocation loc = commandName.getLocation(); - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); - - Variant commandArguments = reader.getCommandArguments(); - ASSERT_TRUE(commandArguments.isMap()); - ASSERT_EQ(3U, commandArguments.asMap().size()); - ASSERT_EQ(1U, commandArguments.asMap().count("a")); - ASSERT_EQ(1U, commandArguments.asMap().count("b")); - ASSERT_EQ(1U, commandArguments.asMap().count("c")); - ASSERT_EQ(1, commandArguments.asMap()["a"].asInt()); - ASSERT_EQ(2, 
commandArguments.asMap()["b"].asInt()); - ASSERT_EQ("test", commandArguments.asMap()["c"].asString()); - - loc = commandArguments.asMap()["a"].getLocation(); - ASSERT_EQ(8U, loc.getStart()); - ASSERT_EQ(9U, loc.getEnd()); - - loc = commandArguments.asMap()["b"].getLocation(); - ASSERT_EQ(12U, loc.getStart()); - ASSERT_EQ(13U, loc.getEnd()); - - loc = commandArguments.asMap()["c"].getLocation(); - ASSERT_EQ(16U, loc.getStart()); - ASSERT_EQ(22U, loc.getEnd()); - - ASSERT_EQ(OsdmStreamParser::State::END, reader.parse()); -} - -TEST(OsdmStreamParser, simpleCommandWithArgumentsAndName) -{ - const char *testString = "\\test#bla[a=1,b=2,c=\"test\"]"; - // 0 1234567890123456789 01234 56 - // 0 1 2 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - ASSERT_EQ(OsdmStreamParser::State::COMMAND, reader.parse()); - - Variant commandName = reader.getCommandName(); - ASSERT_EQ("test", commandName.asString()); - SourceLocation loc = commandName.getLocation(); - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(5U, loc.getEnd()); - - Variant commandArguments = reader.getCommandArguments(); - ASSERT_TRUE(commandArguments.isMap()); - ASSERT_EQ(4U, commandArguments.asMap().size()); - ASSERT_EQ(1U, commandArguments.asMap().count("a")); - ASSERT_EQ(1U, commandArguments.asMap().count("b")); - ASSERT_EQ(1U, commandArguments.asMap().count("c")); - ASSERT_EQ(1U, commandArguments.asMap().count("name")); - ASSERT_EQ(1, commandArguments.asMap()["a"].asInt()); - ASSERT_EQ(2, commandArguments.asMap()["b"].asInt()); - ASSERT_EQ("test", commandArguments.asMap()["c"].asString()); - ASSERT_EQ("bla", commandArguments.asMap()["name"].asString()); - - loc = commandArguments.asMap()["a"].getLocation(); - ASSERT_EQ(12U, loc.getStart()); - ASSERT_EQ(13U, loc.getEnd()); - - loc = commandArguments.asMap()["b"].getLocation(); - ASSERT_EQ(16U, loc.getStart()); - ASSERT_EQ(17U, loc.getEnd()); - - loc = commandArguments.asMap()["c"].getLocation(); - ASSERT_EQ(20U, loc.getStart()); - 
ASSERT_EQ(26U, loc.getEnd()); - - loc = commandArguments.asMap()["name"].getLocation(); - ASSERT_EQ(5U, loc.getStart()); - ASSERT_EQ(9U, loc.getEnd()); - - ASSERT_EQ(OsdmStreamParser::State::END, reader.parse()); -} - -static void assertCommand(OsdmStreamParser &reader, const std::string &name, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) -{ - ASSERT_EQ(OsdmStreamParser::State::COMMAND, reader.parse()); - EXPECT_EQ(name, reader.getCommandName().asString()); - if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getCommandName().getLocation().getStart()); - EXPECT_EQ(start, reader.getLocation().getStart()); - } - if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getCommandName().getLocation().getEnd()); - EXPECT_EQ(end, reader.getLocation().getEnd()); - } -} - -static void assertCommand(OsdmStreamParser &reader, const std::string &name, - const Variant::mapType &args, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) -{ - assertCommand(reader, name, start, end); - EXPECT_EQ(args, reader.getCommandArguments()); -} - -static void assertData(OsdmStreamParser &reader, const std::string &data, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) -{ - ASSERT_EQ(OsdmStreamParser::State::DATA, reader.parse()); - EXPECT_EQ(data, reader.getData().asString()); - if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getData().getLocation().getStart()); - EXPECT_EQ(start, reader.getLocation().getStart()); - } - if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getData().getLocation().getEnd()); - EXPECT_EQ(end, reader.getLocation().getEnd()); - } -} - -static void assertFieldStart(OsdmStreamParser &reader, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) -{ - ASSERT_EQ(OsdmStreamParser::State::FIELD_START, reader.parse()); - if (start != InvalidSourceOffset) { - EXPECT_EQ(start, 
reader.getLocation().getStart()); - } - if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getLocation().getEnd()); - } -} - -static void assertFieldEnd(OsdmStreamParser &reader, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) -{ - ASSERT_EQ(OsdmStreamParser::State::FIELD_END, reader.parse()); - if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getLocation().getStart()); - } - if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getLocation().getEnd()); - } -} - -static void assertEnd(OsdmStreamParser &reader, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) -{ - ASSERT_EQ(OsdmStreamParser::State::END, reader.parse()); - if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getLocation().getStart()); - } - if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getLocation().getEnd()); - } -} - -TEST(OsdmStreamParser, fields) -{ - const char *testString = "\\test{a}{b}{c}"; - // 01234567890123 - // 0 1 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, 5, 6); - assertData(reader, "a", 6, 7); - assertFieldEnd(reader, 7, 8); - - assertFieldStart(reader, 8, 9); - assertData(reader, "b", 9, 10); - assertFieldEnd(reader, 10, 11); - - assertFieldStart(reader, 11, 12); - assertData(reader, "c", 12, 13); - assertFieldEnd(reader, 13, 14); - assertEnd(reader, 14, 14); -} - -TEST(OsdmStreamParser, dataOutsideField) -{ - const char *testString = "\\test{a}{b} c"; - // 0123456789012 - // 0 1 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, 5, 6); - assertData(reader, "a", 6, 7); - assertFieldEnd(reader, 7, 8); - - assertFieldStart(reader, 8, 9); - assertData(reader, "b", 9, 10); - assertFieldEnd(reader, 10, 11); - - assertData(reader, "c", 12, 13); - assertEnd(reader, 
13, 13); -} - -TEST(OsdmStreamParser, nestedCommand) -{ - const char *testString = "\\test{a}{\\test2{b} c} d"; - // 012345678 90123456789012 - // 0 1 2 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "test", 0, 5); - - assertFieldStart(reader, 5, 6); - assertData(reader, "a", 6, 7); - assertFieldEnd(reader, 7, 8); - - assertFieldStart(reader, 8, 9); - { - assertCommand(reader, "test2", 9, 15); - assertFieldStart(reader, 15, 16); - assertData(reader, "b", 16, 17); - assertFieldEnd(reader, 17, 18); - } - assertData(reader, "c", 19, 20); - assertFieldEnd(reader, 20, 21); - assertData(reader, "d", 22, 23); - assertEnd(reader, 23, 23); -} - -TEST(OsdmStreamParser, nestedCommandImmediateEnd) -{ - const char *testString = "\\test{\\test2{b}} d"; - // 012345 678901234567 - // 0 1 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, 5, 6); - { - assertCommand(reader, "test2", 6, 12); - assertFieldStart(reader, 12, 13); - assertData(reader, "b", 13, 14); - assertFieldEnd(reader, 14, 15); - } - assertFieldEnd(reader, 15, 16); - assertData(reader, "d", 17, 18); - assertEnd(reader, 18, 18); -} - -TEST(OsdmStreamParser, nestedCommandNoData) -{ - const char *testString = "\\test{\\test2}"; - // 012345 6789012 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, 5, 6); - assertCommand(reader, "test2", 6, 12); - assertFieldEnd(reader, 12, 13); - assertEnd(reader, 13, 13); -} - -TEST(OsdmStreamParser, multipleCommands) -{ - const char *testString = "\\a \\b \\c \\d"; - // 012 345 678 90 - // 0 1 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "a", 0, 2); - assertCommand(reader, "b", 3, 5); - assertCommand(reader, "c", 6, 8); - assertCommand(reader, "d", 9, 11); - 
assertEnd(reader, 11, 11); -} - -TEST(OsdmStreamParser, fieldsWithSpaces) -{ - const char *testString = "\\a {\\b \\c} \n\n {\\d}"; - // 0123 456 789012 3 456 789 - // 0 1 - CharReader charReader(testString); - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "a", 0, 2); - assertFieldStart(reader, 3, 4); - assertCommand(reader, "b", 4, 6); - assertCommand(reader, "c", 7, 9); - assertFieldEnd(reader, 9, 10); - assertFieldStart(reader, 16, 17); - assertCommand(reader, "d", 17, 19); - assertFieldEnd(reader, 19, 20); - assertEnd(reader, 20, 20); -} - -TEST(OsdmStreamParser, errorNoFieldToStart) -{ - const char *testString = "\\a b {"; - // 012345 - // 0 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - assertCommand(reader, "a", 0, 2); - assertData(reader, "b", 3, 4); - ASSERT_FALSE(logger.hasError()); - assertEnd(reader, 6, 6); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, errorNoFieldToEnd) -{ - const char *testString = "\\a b }"; - // 012345 - // 0 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - assertCommand(reader, "a", 0, 2); - assertData(reader, "b", 3, 4); - ASSERT_FALSE(logger.hasError()); - assertEnd(reader, 6, 6); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, errorNoFieldEndNested) -{ - const char *testString = "\\test{\\test2{}}}"; - // 012345 6789012345 - // 0 1 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, 5, 6); - assertCommand(reader, "test2", 6, 12); - assertFieldStart(reader, 12, 13); - assertFieldEnd(reader, 13, 14); - assertFieldEnd(reader, 14, 15); - ASSERT_FALSE(logger.hasError()); - assertEnd(reader, 16, 16); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, errorNoFieldEndNestedData) -{ - const char *testString = "\\test{\\test2{}}a}"; - 
// 012345 67890123456 - // 0 1 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, 5, 6); - assertCommand(reader, "test2", 6, 12); - assertFieldStart(reader, 12, 13); - assertFieldEnd(reader, 13, 14); - assertFieldEnd(reader, 14, 15); - assertData(reader, "a", 15, 16); - ASSERT_FALSE(logger.hasError()); - assertEnd(reader, 17, 17); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, beginEnd) -{ - const char *testString = "\\begin{book}\\end{book}"; - // 012345678901 2345678901 - // 0 1 2 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "book", 7, 11); - assertFieldStart(reader, 12, 13); - assertFieldEnd(reader, 17, 21); - assertEnd(reader, 22, 22); -} - -TEST(OsdmStreamParser, beginEndWithName) -{ - const char *testString = "\\begin{book#a}\\end{book}"; - // 01234567890123 4567890123 - // 0 1 2 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "book", {{"name", "a"}}, 7, 11); - assertFieldStart(reader, 14, 15); - assertFieldEnd(reader, 19, 23); - assertEnd(reader, 24, 24); -} - -TEST(OsdmStreamParser, beginEndWithNameAndArgs) -{ - const char *testString = "\\begin{book#a}[a=1,b=2,c=\"test\"]\\end{book}"; - // 0123456789012345678901234 56789 01 2345678901 - // 0 1 2 3 4 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "book", - {{"name", "a"}, {"a", 1}, {"b", 2}, {"c", "test"}}, 7, 11); - assertFieldStart(reader, 32, 33); - assertFieldEnd(reader, 37, 41); - assertEnd(reader, 42, 42); -} - -TEST(OsdmStreamParser, beginEndWithNameAndArgsMultipleFields) -{ - const char *testString = - "\\begin{book#a}[a=1,b=2,c=\"test\"]{a \\test}{b \\test{}}\\end{book}"; - // 0123456789012345678901234 56789 01234 567890123 45678901 2345678901 - // 0 1 2 3 4 5 6 - 
CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "book", - {{"name", "a"}, {"a", 1}, {"b", 2}, {"c", "test"}}, 7, 11); - assertFieldStart(reader, 32, 33); - assertData(reader, "a", 33, 34); - assertCommand(reader, "test", Variant::mapType{}, 35, 40); - assertFieldEnd(reader, 40, 41); - assertFieldStart(reader, 41, 42); - assertData(reader, "b", 42, 43); - assertCommand(reader, "test", Variant::mapType{}, 44, 49); - assertFieldStart(reader, 49, 50); - assertFieldEnd(reader, 50, 51); - assertFieldEnd(reader, 51, 52); - assertFieldStart(reader, 52, 53); - assertFieldEnd(reader, 57, 61); - assertEnd(reader, 62, 62); -} - -TEST(OsdmStreamParser, beginEndWithData) -{ - const char *testString = "\\begin{book}a\\end{book}"; - // 0123456789012 3456789012 - // 0 1 2 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "book", 7, 11); - assertFieldStart(reader, 12, 13); - assertData(reader, "a", 12, 13); - assertFieldEnd(reader, 18, 22); - assertEnd(reader, 23, 23); -} - -TEST(OsdmStreamParser, beginEndWithCommand) -{ - const char *testString = "\\begin{book}\\a{test}\\end{book}"; - // 012345678901 23456789 0123456789 - // 0 1 2 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "book", 7, 11); - assertFieldStart(reader, 12, 13); - assertCommand(reader, "a", 12, 14); - assertFieldStart(reader, 14, 15); - assertData(reader, "test", 15, 19); - assertFieldEnd(reader, 19, 20); - assertFieldEnd(reader, 25, 29); - assertEnd(reader, 30, 30); -} - -TEST(OsdmStreamParser, errorBeginNoBraceOpen) -{ - const char *testString = "\\begin a"; - // 01234567 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertData(reader, "a", 7, 8); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, errorBeginNoIdentifier) 
-{ - const char *testString = "\\begin{!"; - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - ASSERT_FALSE(logger.hasError()); - ASSERT_THROW(reader.parse(), LoggableException); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, errorBeginNoBraceClose) -{ - const char *testString = "\\begin{a"; - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - ASSERT_FALSE(logger.hasError()); - ASSERT_THROW(reader.parse(), LoggableException); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, errorBeginNoName) -{ - const char *testString = "\\begin{a#}"; - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertCommand(reader, "a"); - ASSERT_TRUE(logger.hasError()); - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertEnd(reader); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, errorEndNoBraceOpen) -{ - const char *testString = "\\end a"; - // 012345 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertData(reader, "a", 5, 6); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, errorEndNoIdentifier) -{ - const char *testString = "\\end{!"; - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - ASSERT_FALSE(logger.hasError()); - ASSERT_THROW(reader.parse(), LoggableException); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, errorEndNoBraceClose) -{ - const char *testString = "\\end{a"; - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - ASSERT_FALSE(logger.hasError()); - ASSERT_THROW(reader.parse(), LoggableException); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, errorEndNoBegin) -{ - const 
char *testString = "\\end{a}"; - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - ASSERT_FALSE(logger.hasError()); - ASSERT_THROW(reader.parse(), LoggableException); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, errorBeginEndMismatch) -{ - const char *testString = "\\begin{a} \\begin{b} test \\end{a}"; - // 0123456789 012345678901234 5678901 - // 0 1 2 3 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - assertCommand(reader, "a", 7, 8); - assertFieldStart(reader, 10, 11); - assertCommand(reader, "b", 17, 18); - assertFieldStart(reader, 20, 24); - assertData(reader, "test", 20, 24); - ASSERT_FALSE(logger.hasError()); - ASSERT_THROW(reader.parse(), LoggableException); - ASSERT_TRUE(logger.hasError()); -} - -TEST(OsdmStreamParser, commandWithNSSep) -{ - const char *testString = "\\test1:test2"; - // 012345678901 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "test1:test2", 0, 12); - assertEnd(reader, 12, 12); -} - -TEST(OsdmStreamParser, beginEndWithNSSep) -{ - const char *testString = "\\begin{test1:test2}\\end{test1:test2}"; - // 0123456789012345678 90123456789012345 - // 0 1 2 3 - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - assertCommand(reader, "test1:test2", 7, 18); - assertFieldStart(reader, 19, 20); - assertFieldEnd(reader, 24, 35); - assertEnd(reader, 36, 36); -} - -TEST(OsdmStreamParser, errorBeginNSSep) -{ - const char *testString = "\\begin:test{blub}\\end{blub}"; - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertCommand(reader, "blub"); - ASSERT_TRUE(logger.hasError()); - assertFieldStart(reader); - assertFieldEnd(reader); - assertEnd(reader); -} - -TEST(OsdmStreamParser, errorEndNSSep) -{ - const char *testString = 
"\\begin{blub}\\end:test{blub}"; - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - assertCommand(reader, "blub"); - assertFieldStart(reader); - ASSERT_FALSE(logger.hasError()); - assertFieldEnd(reader); - ASSERT_TRUE(logger.hasError()); - assertEnd(reader); -} - -TEST(OsdmStreamParser, errorEmptyNs) -{ - const char *testString = "\\test:"; - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertCommand(reader, "test"); - ASSERT_TRUE(logger.hasError()); - assertData(reader, ":"); - assertEnd(reader); -} - -TEST(OsdmStreamParser, errorRepeatedNs) -{ - const char *testString = "\\test::"; - CharReader charReader(testString); - - OsdmStreamParser reader(charReader, logger); - - logger.reset(); - ASSERT_FALSE(logger.hasError()); - assertCommand(reader, "test"); - ASSERT_TRUE(logger.hasError()); - assertData(reader, "::"); - assertEnd(reader); -} -} - diff --git a/test/formats/osdmx/OsdmxParserTest.cpp b/test/formats/osdmx/OsdmxParserTest.cpp deleted file mode 100644 index c0fb50d..0000000 --- a/test/formats/osdmx/OsdmxParserTest.cpp +++ /dev/null @@ -1,314 +0,0 @@ -/* - Ousía - Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace ousia { - -namespace RttiTypes { -extern const Rtti Document; -extern const Rtti Domain; -extern const Rtti Typesystem; -} - -struct XmlStandaloneEnvironment : public StandaloneEnvironment { - XmlParser xmlParser; - FileLocator fileLocator; - - XmlStandaloneEnvironment(ConcreteLogger &logger) - : StandaloneEnvironment(logger) - { - fileLocator.addDefaultSearchPaths(); - fileLocator.addUnittestSearchPath("xmlparser"); - - registry.registerDefaultExtensions(); - registry.registerParser({"text/vnd.ousia.oxm", "text/vnd.ousia.oxd"}, - {&RttiTypes::Node}, &xmlParser); - registry.registerResourceLocator(&fileLocator); - } -}; - -static TerminalLogger logger(std::cerr, true); - -TEST(XmlParser, mismatchedTag) -{ - XmlStandaloneEnvironment env(logger); - env.parse("mismatchedTag.oxm", "", "", RttiSet{&RttiTypes::Document}); - ASSERT_TRUE(logger.hasError()); -} - -TEST(XmlParser, generic) -{ - XmlStandaloneEnvironment env(logger); - env.parse("generic.oxm", "", "", RttiSet{&RttiTypes::Node}); -#ifdef MANAGER_GRAPHVIZ_EXPORT - env.manager.exportGraphviz("xmlDocument.dot"); -#endif -} - -static void checkAttributes(Handle expected, - Handle desc) -{ - if (expected == nullptr) { - ASSERT_TRUE(desc->getAttributesDescriptor()->getAttributes().empty()); - } else { - ASSERT_EQ(expected->getName(), - desc->getAttributesDescriptor()->getName()); - auto &attrs_exp = expected->getAttributes(); - auto &attrs = desc->getAttributesDescriptor()->getAttributes(); - ASSERT_EQ(attrs_exp.size(), attrs.size()); - for (size_t i = 0; i < attrs_exp.size(); i++) { - ASSERT_EQ(attrs_exp[i]->getName(), attrs[i]->getName()); - ASSERT_EQ(attrs_exp[i]->getType(), attrs[i]->getType()); - ASSERT_EQ(attrs_exp[i]->isOptional(), attrs[i]->isOptional()); - ASSERT_EQ(attrs_exp[i]->getDefaultValue(), - attrs[i]->getDefaultValue()); - } - } -} - -static void 
checkStructuredClass( - Handle n, const std::string &name, Handle domain, - Variant cardinality = Cardinality::any(), - Handle attributesDescriptor = nullptr, - Handle superclass = nullptr, bool transparent = false, - bool root = false) -{ - ASSERT_FALSE(n == nullptr); - Handle sc = n.cast(); - ASSERT_FALSE(sc == nullptr); - ASSERT_EQ(name, sc->getName()); - ASSERT_EQ(domain, sc->getParent()); - ASSERT_EQ(cardinality, sc->getCardinality()); - ASSERT_EQ(transparent, sc->isTransparent()); - ASSERT_EQ(root, sc->hasRootPermission()); - checkAttributes(attributesDescriptor, sc); -} - -static Rooted checkStructuredClass( - const std::string &resolve, const std::string &name, Handle domain, - Variant cardinality = Cardinality::any(), - Handle attributesDescriptor = nullptr, - Handle superclass = nullptr, bool transparent = false, - bool root = false) -{ - auto res = domain->resolve(&RttiTypes::StructuredClass, resolve); - if (res.size() != 1) { - throw OusiaException("resolution error!"); - } - Handle sc = res[0].node.cast(); - checkStructuredClass(sc, name, domain, cardinality, attributesDescriptor, - superclass, transparent, root); - return sc; -} - -static void checkAnnotationClass( - Handle n, const std::string &name, Handle domain, - Handle attributesDescriptor = nullptr) -{ - ASSERT_FALSE(n == nullptr); - Handle ac = n.cast(); - ASSERT_FALSE(ac == nullptr); - ASSERT_EQ(name, ac->getName()); - ASSERT_EQ(domain, ac->getParent()); - checkAttributes(attributesDescriptor, ac); -} - -static Rooted checkAnnotationClass( - const std::string &resolve, const std::string &name, Handle domain, - Handle attributesDescriptor = nullptr) -{ - auto res = domain->resolve(&RttiTypes::AnnotationClass, resolve); - if (res.size() != 1) { - throw OusiaException("resolution error!"); - } - Handle ac = res[0].node.cast(); - checkAnnotationClass(ac, name, domain, attributesDescriptor); - return ac; -} - -static void checkFieldDescriptor( - Handle n, const std::string &name, Handle parent, - 
NodeVector children, - FieldDescriptor::FieldType type = FieldDescriptor::FieldType::TREE, - Handle primitiveType = nullptr, bool optional = false) -{ - ASSERT_FALSE(n == nullptr); - Handle field = n.cast(); - ASSERT_FALSE(field.isNull()); - ASSERT_EQ(name, field->getName()); - ASSERT_EQ(parent, field->getParent()); - ASSERT_EQ(type, field->getFieldType()); - ASSERT_EQ(primitiveType, field->getPrimitiveType()); - ASSERT_EQ(optional, field->isOptional()); - // check the children. - ASSERT_EQ(children.size(), field->getChildren().size()); - for (unsigned int c = 0; c < children.size(); c++) { - ASSERT_EQ(children[c], field->getChildren()[c]); - } -} - -static void checkFieldDescriptor( - Handle desc, Handle parent, - NodeVector children, - const std::string &name = DEFAULT_FIELD_NAME, - FieldDescriptor::FieldType type = FieldDescriptor::FieldType::TREE, - Handle primitiveType = nullptr, bool optional = false) -{ - auto res = desc->resolve(&RttiTypes::FieldDescriptor, name); - ASSERT_EQ(1, res.size()); - checkFieldDescriptor(res[0].node, name, parent, children, type, - primitiveType, optional); -} - -static void checkFieldDescriptor( - Handle desc, NodeVector children, - const std::string &name = DEFAULT_FIELD_NAME, - FieldDescriptor::FieldType type = FieldDescriptor::FieldType::TREE, - Handle primitiveType = nullptr, bool optional = false) -{ - checkFieldDescriptor(desc, desc, children, name, type, primitiveType, - optional); -} - -TEST(XmlParser, domainParsing) -{ - XmlStandaloneEnvironment env(logger); - Rooted book_domain_node = - env.parse("book_domain.oxm", "", "", RttiSet{&RttiTypes::Domain}); - ASSERT_FALSE(book_domain_node == nullptr); - ASSERT_FALSE(logger.hasError()); - // check the domain node. - Rooted book_domain = book_domain_node.cast(); - ASSERT_EQ("book", book_domain->getName()); - // get the book struct node. 
- Cardinality single; - single.merge({1}); - Rooted bookAuthor{ - new StructType(book_domain->getManager(), "", nullptr)}; - bookAuthor->addAttribute( - {new Attribute(book_domain->getManager(), "author", - env.project->getSystemTypesystem()->getStringType(), - "")}, - logger); - Rooted book = checkStructuredClass( - "book", "book", book_domain, single, bookAuthor, nullptr, false, true); - // get the chapter struct node. - Rooted chapter = - checkStructuredClass("chapter", "chapter", book_domain); - Rooted section = - checkStructuredClass("section", "section", book_domain); - Rooted subsection = - checkStructuredClass("subsection", "subsection", book_domain); - Rooted paragraph = - checkStructuredClass("paragraph", "paragraph", book_domain, - Cardinality::any(), nullptr, nullptr, true, false); - Rooted text = - checkStructuredClass("text", "text", book_domain, Cardinality::any(), - nullptr, nullptr, true, false); - - // check the FieldDescriptors. - checkFieldDescriptor(book, {chapter, paragraph}); - checkFieldDescriptor(chapter, {section, paragraph}); - checkFieldDescriptor(section, {subsection, paragraph}); - checkFieldDescriptor(subsection, {paragraph}); - checkFieldDescriptor(paragraph, {text}); - checkFieldDescriptor( - text, {}, DEFAULT_FIELD_NAME, FieldDescriptor::FieldType::PRIMITIVE, - env.project->getSystemTypesystem()->getStringType(), false); - - // check parent handling using the headings domain. - Rooted headings_domain_node = - env.parse("headings_domain.oxm", "", "", RttiSet{&RttiTypes::Domain}); - ASSERT_FALSE(headings_domain_node == nullptr); - ASSERT_FALSE(logger.hasError()); - Rooted headings_domain = headings_domain_node.cast(); - // now there should be a heading struct. - Rooted heading = - checkStructuredClass("heading", "heading", headings_domain, single, - nullptr, nullptr, true, false); - // which should be a reference to the paragraph descriptor. 
- checkFieldDescriptor(heading, paragraph, {text}); - // and each struct in the book domain (except for text) should have a - // heading field now. - checkFieldDescriptor(book, {heading}, "heading", - FieldDescriptor::FieldType::SUBTREE, nullptr, true); - checkFieldDescriptor(chapter, {heading}, "heading", - FieldDescriptor::FieldType::SUBTREE, nullptr, true); - checkFieldDescriptor(section, {heading}, "heading", - FieldDescriptor::FieldType::SUBTREE, nullptr, true); - checkFieldDescriptor(subsection, {heading}, "heading", - FieldDescriptor::FieldType::SUBTREE, nullptr, true); - checkFieldDescriptor(paragraph, {heading}, "heading", - FieldDescriptor::FieldType::SUBTREE, nullptr, true); - - // check annotation handling using the comments domain. - Rooted comments_domain_node = - env.parse("comments_domain.oxm", "", "", RttiSet{&RttiTypes::Domain}); - ASSERT_FALSE(comments_domain_node == nullptr); - ASSERT_FALSE(logger.hasError()); - Rooted comments_domain = comments_domain_node.cast(); - // now we should be able to find a comment annotation. - Rooted comment_anno = - checkAnnotationClass("comment", "comment", comments_domain); - // as well as a comment struct - Rooted comment = - checkStructuredClass("comment", "comment", comments_domain); - // and a reply struct - Rooted reply = - checkStructuredClass("reply", "reply", comments_domain); - // check the fields for each of them. - { - std::vector> descs{comment_anno, comment, reply}; - for (auto &d : descs) { - checkFieldDescriptor(d, {paragraph}, "content", - FieldDescriptor::FieldType::SUBTREE, nullptr, - false); - checkFieldDescriptor(d, {reply}, "replies", - FieldDescriptor::FieldType::SUBTREE, nullptr, - false); - } - } - // paragraph should have comment as child now as well. - checkFieldDescriptor(paragraph, {text, comment}); - // as should heading, because it references the paragraph default field. 
- checkFieldDescriptor(heading, paragraph, {text, comment}); -} - -TEST(XmlParser, documentParsing) -{ - XmlStandaloneEnvironment env(logger); - Rooted book_domain_node = - env.parse("simple_book.oxd", "", "", RttiSet{&RttiTypes::Document}); - //TODO: Check result -} -} - -- cgit v1.2.3 From 2659b4595d809cbd69a77e5ff7e2fc08d225f065 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 00:02:54 +0100 Subject: Tidied OsxmlEventParser up, implemented correct whitespace handling, started to write unit tests for the osxml parser --- CMakeLists.txt | 93 +++--- src/core/common/Utils.hpp | 21 +- src/core/common/WhitespaceHandler.hpp | 60 ++++ src/formats/osxml/OsxmlAttributeLocator.cpp | 144 ++++++++++ src/formats/osxml/OsxmlAttributeLocator.hpp | 67 +++++ src/formats/osxml/OsxmlEventParser.cpp | 425 +++++++++++++++------------- src/formats/osxml/OsxmlEventParser.hpp | 44 +-- test/formats/osml/OsmlStreamParserTest.cpp | 1 + test/formats/osxml/OsxmlEventParserTest.cpp | 222 +++++++++++++++ 9 files changed, 811 insertions(+), 266 deletions(-) create mode 100644 src/formats/osxml/OsxmlAttributeLocator.cpp create mode 100644 src/formats/osxml/OsxmlAttributeLocator.hpp create mode 100644 test/formats/osxml/OsxmlEventParserTest.cpp (limited to 'src/formats') diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e3b90f..bdc9541 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -147,9 +147,9 @@ ADD_LIBRARY(ousia_core src/core/model/RootNode src/core/model/Style src/core/model/Typesystem -# src/core/parser/Parser -# src/core/parser/ParserContext -# src/core/parser/ParserScope + src/core/parser/Parser + src/core/parser/ParserContext + src/core/parser/ParserScope # src/core/parser/generic/ParserState # src/core/parser/generic/ParserStateCallbacks # src/core/parser/generic/ParserStateHandler @@ -183,36 +183,37 @@ TARGET_LINK_LIBRARIES(ousia_osml ousia_core ) -#ADD_LIBRARY(ousia_osxml -# src/formats/osxml/osxmlParser -#) +ADD_LIBRARY(ousia_osxml + 
src/formats/osxml/OsxmlAttributeLocator + src/formats/osxml/OsxmlEventParser +) -#TARGET_LINK_LIBRARIES(ousia_osxml -# ousia_core -# ${EXPAT_LIBRARIES} -#) +TARGET_LINK_LIBRARIES(ousia_osxml + ousia_core + ${EXPAT_LIBRARIES} +) # Resource locators -#ADD_LIBRARY(ousia_filesystem -# src/plugins/filesystem/FileLocator -# src/plugins/filesystem/SpecialPaths -#) +ADD_LIBRARY(ousia_filesystem + src/plugins/filesystem/FileLocator + src/plugins/filesystem/SpecialPaths +) -#TARGET_LINK_LIBRARIES(ousia_filesystem -# ousia_core -# ${Boost_LIBRARIES} -#) +TARGET_LINK_LIBRARIES(ousia_filesystem + ousia_core + ${Boost_LIBRARIES} +) # Output libraries -#ADD_LIBRARY(ousia_html -# src/plugins/html/DemoOutput -#) +ADD_LIBRARY(ousia_html + src/plugins/html/DemoOutput +) -#TARGET_LINK_LIBRARIES(ousia_html -# ousia_core -#) +TARGET_LINK_LIBRARIES(ousia_html + ousia_core +) #ADD_LIBRARY(ousia_mozjs # src/plugins/mozjs/MozJsScriptEngine @@ -247,7 +248,7 @@ IF(TEST) ADD_EXECUTABLE(ousia_test_core test/core/RangeSetTest -# test/core/RegistryTest + test/core/RegistryTest test/core/XMLTest test/core/common/ArgumentTest test/core/common/CharReaderTest @@ -272,7 +273,7 @@ IF(TEST) test/core/model/NodeTest test/core/model/StyleTest test/core/model/TypesystemTest -# test/core/parser/ParserScopeTest + test/core/parser/ParserScopeTest # test/core/parser/ParserStackTest # test/core/parser/ParserStateTest test/core/parser/utils/TokenizerTest @@ -311,15 +312,15 @@ IF(TEST) # ousia_css # ) -# ADD_EXECUTABLE(ousia_test_html -# test/plugins/html/DemoOutputTest -# ) + ADD_EXECUTABLE(ousia_test_html + test/plugins/html/DemoOutputTest + ) -# TARGET_LINK_LIBRARIES(ousia_test_html -# ${GTEST_LIBRARIES} -# ousia_core -# ousia_html -# ) + TARGET_LINK_LIBRARIES(ousia_test_html + ${GTEST_LIBRARIES} + ousia_core + ousia_html + ) ADD_EXECUTABLE(ousia_test_osml test/formats/osml/OsmlStreamParserTest @@ -331,16 +332,16 @@ IF(TEST) ousia_osml ) -# ADD_EXECUTABLE(ousia_test_osxml -# test/plugins/xml/XmlParserTest -# ) 
+ ADD_EXECUTABLE(ousia_test_osxml + test/formats/osxml/OsxmlEventParserTest + ) -# TARGET_LINK_LIBRARIES(ousia_test_osxml -# ${GTEST_LIBRARIES} -# ousia_core -# ousia_osml -# ousia_filesystem -# ) + TARGET_LINK_LIBRARIES(ousia_test_osxml + ${GTEST_LIBRARIES} + ousia_core + ousia_osxml + ousia_filesystem + ) # ADD_EXECUTABLE(ousia_test_mozjs # test/plugins/mozjs/MozJsScriptEngineTest @@ -354,11 +355,11 @@ IF(TEST) # Register the unit tests ADD_TEST(ousia_test_core ousia_test_core) -# ADD_TEST(ousia_test_filesystem ousia_test_filesystem) + ADD_TEST(ousia_test_filesystem ousia_test_filesystem) # ADD_TEST(ousia_test_css ousia_test_css) -# ADD_TEST(ousia_test_html ousia_test_html) + ADD_TEST(ousia_test_html ousia_test_html) ADD_TEST(ousia_test_osml ousia_test_osml) -# ADD_TEST(ousia_test_osxml ousia_test_osxml) + ADD_TEST(ousia_test_osxml ousia_test_osxml) # ADD_TEST(ousia_test_mozjs ousia_test_mozjs) ENDIF() diff --git a/src/core/common/Utils.hpp b/src/core/common/Utils.hpp index 16a9136..8361973 100644 --- a/src/core/common/Utils.hpp +++ b/src/core/common/Utils.hpp @@ -119,9 +119,26 @@ public: */ template static std::pair trim(const T &s, Filter f) + { + return trim(s, s.size(), f); + } + + /** + * Trims the given string or vector of chars by returning the start and end + * index. + * + * @param s is the container that should be trimmed. + * @param len is the number of elements in the container. + * @param f is a function that returns true for values that should be + * removed. + * @return start and end index. 
Note that "end" points at the character + * beyond the end, thus "end" minus "start" + */ + template + static std::pair trim(const T &s, size_t len, Filter f) { size_t start = 0; - for (size_t i = 0; i < s.size(); i++) { + for (size_t i = 0; i < len; i++) { if (!f(s[i])) { start = i; break; @@ -129,7 +146,7 @@ public: } size_t end = 0; - for (ssize_t i = s.size() - 1; i >= static_cast(start); i--) { + for (ssize_t i = len - 1; i >= static_cast(start); i--) { if (!f(s[i])) { end = i + 1; break; diff --git a/src/core/common/WhitespaceHandler.hpp b/src/core/common/WhitespaceHandler.hpp index 79e0518..ed52ea3 100644 --- a/src/core/common/WhitespaceHandler.hpp +++ b/src/core/common/WhitespaceHandler.hpp @@ -97,6 +97,25 @@ public: * @param end is the end byte offset of the given character. */ void append(char c, size_t start, size_t end) + { + append(c, start, end, textBuf, textStart, textEnd); + } + + /** + * Static version of PreservingWhitespaceHandler append + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + * @param textBuf is a reference at the text buffer that is to be used. + * @param textStart is a reference at the text start variable that is to be + * used. + * @param textEnd is a reference at the text end variable that is to be + * used. + */ + static void append(char c, size_t start, size_t end, + std::vector &textBuf, size_t &textStart, + size_t &textEnd) { if (textBuf.empty()) { textStart = start; @@ -129,6 +148,27 @@ public: * @param end is the end byte offset of the given character. */ void append(char c, size_t start, size_t end) + { + append(c, start, end, textBuf, textStart, textEnd, whitespaceBuf); + } + + /** + * Static version of TrimmingWhitespaceHandler append + * + * @param c is the character that should be appended to the internal buffer. 
+ * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + * @param textBuf is a reference at the text buffer that is to be used. + * @param textStart is a reference at the text start variable that is to be + * used. + * @param textEnd is a reference at the text end variable that is to be + * used. + * @param whitespaceBuf is a reference at the buffer for storing whitespace + * characters. + */ + static void append(char c, size_t start, size_t end, + std::vector &textBuf, size_t &textStart, + size_t &textEnd, std::vector &whitespaceBuf) { // Handle whitespace characters if (Utils::isWhitespace(c)) { @@ -174,6 +214,26 @@ public: * @param end is the end byte offset of the given character. */ void append(char c, size_t start, size_t end) + { + append(c, start, end, textBuf, textStart, textEnd, hasWhitespace); + } + + /** + * Static version of CollapsingWhitespaceHandler append + * + * @param c is the character that should be appended to the internal buffer. + * @param start is the start byte offset of the given character. + * @param end is the end byte offset of the given character. + * @param textBuf is a reference at the text buffer that is to be used. + * @param textStart is a reference at the text start variable that is to be + * used. + * @param textEnd is a reference at the text end variable that is to be + * used. + * @param hasWhitespace is a reference at the "hasWhitespace" flag. 
+ */ + static void append(char c, size_t start, size_t end, + std::vector &textBuf, size_t &textStart, + size_t &textEnd, bool &hasWhitespace) { // Handle whitespace characters if (Utils::isWhitespace(c)) { diff --git a/src/formats/osxml/OsxmlAttributeLocator.cpp b/src/formats/osxml/OsxmlAttributeLocator.cpp new file mode 100644 index 0000000..e37446a --- /dev/null +++ b/src/formats/osxml/OsxmlAttributeLocator.cpp @@ -0,0 +1,144 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include +#include +#include + +#include "OsxmlAttributeLocator.hpp" + +namespace ousia { + +/** + * Enum used internally in the statemachine of the xml argument parser. + */ +enum class XmlAttributeState { + IN_TAG_NAME, + SEARCH_ATTR, + IN_ATTR_NAME, + HAS_ATTR_NAME, + HAS_ATTR_EQUALS, + IN_ATTR_DATA +}; + +std::map OsxmlAttributeLocator::locate( + CharReader &reader, size_t offs) +{ + std::map res; + + // Fork the reader, we don't want to mess up the XML parsing process, do we? + CharReaderFork readerFork = reader.fork(); + + // Move the read cursor to the start location, abort if this does not work + if (offs != readerFork.seek(offs)) { + return res; + } + + // Now all we need to do is to implement one half of an XML parser. As this + // is inherently complicated we'll totaly fail at it. Don't care. 
All we + // want to get is those darn offsets for pretty error messages... (and we + // can assume the XML is valid as it was already read by expat) + XmlAttributeState state = XmlAttributeState::IN_TAG_NAME; + char c; + std::stringstream attrName; + while (readerFork.read(c)) { + // Abort at the end of the tag + if (c == '>' && state != XmlAttributeState::IN_ATTR_DATA) { + return res; + } + + // One state machine to rule them all, one state machine to find them, + // One state machine to bring them all and in the darkness bind them + // (the byte offsets) + switch (state) { + case XmlAttributeState::IN_TAG_NAME: + if (Utils::isWhitespace(c)) { + res.emplace("$tag", + SourceLocation{reader.getSourceId(), offs + 1, + readerFork.getOffset() - 1}); + state = XmlAttributeState::SEARCH_ATTR; + } + break; + case XmlAttributeState::SEARCH_ATTR: + if (!Utils::isWhitespace(c)) { + state = XmlAttributeState::IN_ATTR_NAME; + attrName << c; + } + break; + case XmlAttributeState::IN_ATTR_NAME: + if (Utils::isWhitespace(c)) { + state = XmlAttributeState::HAS_ATTR_NAME; + } else if (c == '=') { + state = XmlAttributeState::HAS_ATTR_EQUALS; + } else { + attrName << c; + } + break; + case XmlAttributeState::HAS_ATTR_NAME: + if (!Utils::isWhitespace(c)) { + if (c == '=') { + state = XmlAttributeState::HAS_ATTR_EQUALS; + break; + } + // Well, this is a strange XML file... We expected to + // see a '=' here! Try to continue with the + // "HAS_ATTR_EQUALS" state as this state will hopefully + // inlcude some error recovery + } else { + // Skip whitespace here + break; + } + // Fallthrough + case XmlAttributeState::HAS_ATTR_EQUALS: + if (!Utils::isWhitespace(c)) { + if (c == '"') { + // Here we are! We have found the beginning of an + // attribute. 
Let's quickly lock the current offset away + // in the result map + res.emplace(attrName.str(), + SourceLocation{reader.getSourceId(), + readerFork.getOffset()}); + state = XmlAttributeState::IN_ATTR_DATA; + } else { + // No, this XML file is not well formed. Assume we're in + // an attribute name once again + attrName.str(std::string{&c, 1}); + state = XmlAttributeState::IN_ATTR_NAME; + } + } + break; + case XmlAttributeState::IN_ATTR_DATA: + if (c == '"') { + // We're at the end of the attribute data, set the end + // location + auto it = res.find(attrName.str()); + if (it != res.end()) { + it->second.setEnd(readerFork.getOffset() - 1); + } + + // Reset the attribute name and restart the search + attrName.str(std::string{}); + state = XmlAttributeState::SEARCH_ATTR; + } + break; + } + } + return res; +} +} + diff --git a/src/formats/osxml/OsxmlAttributeLocator.hpp b/src/formats/osxml/OsxmlAttributeLocator.hpp new file mode 100644 index 0000000..f9a3437 --- /dev/null +++ b/src/formats/osxml/OsxmlAttributeLocator.hpp @@ -0,0 +1,67 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/** + * @file OsxmlAttributeLocator.hpp + * + * Contains a class used for locating the byte offsets of the attributes given + * in a XML tag. 
+ * + * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) + */ + +#ifndef _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_ +#define _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_ + +#include + +namespace ousia { + +// Forward declarations +class CharReader; +class SourceLocation; + +/** + * Class containing one static function for locating the byte offsets of the + * attributes in a XML tag. This are not retrieved by our xml parser, so we have + * to do this manually. + */ +class OsxmlAttributeLocator { +public: + /** + * Function used to reconstruct the location of the attributes of a XML tag + * in the source code. This is necessary, as the xml parser only returns an + * offset to the begining of a tag and not to the position of the individual + * arguments. + * + * @param reader is the char reader from which the character data should be + * read. + * @param offs is a byte offset in the xml file pointing at the "<" + * character of the tag. + * @return a map from attribute keys to the corresponding location + * (including range) of the atribute. Also contains the location of the + * tagname in the form of the virtual attribute "$tag". + */ + static std::map locate(CharReader &reader, + size_t offs); +}; + +} + +#endif /* _OUSIA_OSXML_ATTRIBUTE_LOCATOR_HPP_ */ + diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp index 2ef170e..b4aff77 100644 --- a/src/formats/osxml/OsxmlEventParser.cpp +++ b/src/formats/osxml/OsxmlEventParser.cpp @@ -18,14 +18,22 @@ #include +#include + +#include #include #include +#include #include +#include +#include "OsxmlAttributeLocator.hpp" #include "OsxmlEventParser.hpp" namespace ousia { +/* Class OsxmlEventParser */ + /** * Class containing data used by the internal functions. */ @@ -43,41 +51,75 @@ public: */ ssize_t annotationEndTagDepth; + /** + * Current character data buffer. 
+ */ + std::vector textBuf; + + /** + * Current whitespace buffer (for the trimming whitspace mode) + */ + std::vector whitespaceBuf; + + /** + * Flag indicating whether a whitespace character was present (for the + * collapsing whitespace mode). + */ + bool hasWhitespace; + + /** + * Current character data start. + */ + size_t textStart; + + /** + * Current character data end. + */ + size_t textEnd; + /** * Default constructor. */ - OsxmlEventParserData() : depth(0), annotationEndTagDepth(-1) {} + OsxmlEventParserData(); /** * Increments the depth. */ - void incrDepth() { depth++; } + void incrDepth(); /** * Decrement the depth and reset the annotationEndTagDepth flag. */ - void decrDepth() - { - if (depth > 0) { - depth--; - } - if (depth < annotationEndTagDepth) { - annotationEndTagDepth = -1; - } - } + void decrDepth(); /** * Returns true if we're currently inside an end tag. */ - bool inAnnotationEndTag() { depth >= annotationEndTagDepth; } + bool inAnnotationEndTag(); + + /** + * Returns true if character data is available. + * + * @return true if character data is available. + */ + bool hasText(); + + /** + * Returns a Variant containing the character data and its location. + * + * @return a string variant containing the text data and the character + * location. + */ + Variant getText(SourceId sourceId); }; -namespace { +/* Class GuardedExpatXmlParser */ + /** * Wrapper class around the XML_Parser pointer which safely frees it whenever * the scope is left (e.g. because an exception was thrown). */ -class ScopedExpatXmlParser { +class GuardedExpatXmlParser { private: /** * Internal pointer to the XML_Parser instance. @@ -86,14 +128,14 @@ private: public: /** - * Constructor of the ScopedExpatXmlParser class. Calls XML_ParserCreateNS + * Constructor of the GuardedExpatXmlParser class. Calls XML_ParserCreateNS * from the expat library. Throws a parser exception if the XML parser * cannot be initialized. 
* * @param encoding is the protocol-defined encoding passed to expat (or * nullptr if expat should determine the encoding by itself). */ - ScopedExpatXmlParser(const XML_Char *encoding) : parser(nullptr) + GuardedExpatXmlParser(const XML_Char *encoding) : parser(nullptr) { parser = XML_ParserCreate(encoding); if (!parser) { @@ -103,9 +145,9 @@ public: } /** - * Destuctor of the ScopedExpatXmlParser, frees the XML parser instance. + * Destuctor of the GuardedExpatXmlParser, frees the XML parser instance. */ - ~ScopedExpatXmlParser() + ~GuardedExpatXmlParser() { if (parser) { XML_ParserFree(parser); @@ -120,134 +162,20 @@ public: }; /** - * Enum used internally in the statemachine of the micro-xml argument parser. + * Name of the special outer tag used for allowing multiple top-level elements + * in an xml file. */ -enum class XmlAttributeState { - IN_TAG_NAME, - SEARCH_ATTR, - IN_ATTR_NAME, - HAS_ATTR_NAME, - HAS_ATTR_EQUALS, - IN_ATTR_DATA -}; +static const std::string TOP_LEVEL_TAG{"ousia"}; /** - * Function used to reconstruct the location of the attributes of a XML tag in - * the source code. This is necessary, as the xml parser only returns an offset - * to the begining of a tag and not to the position of the individual arguments. - * - * @param reader is the char reader from which the character data should be - * read. - * @param offs is a byte offset in the xml file pointing at the "<" character of - * the tag. - * @return a map from attribute keys to the corresponding location (including - * range) of the atribute. Also contains the location of the tagname in the - * form of the virtual attribute "$tag". + * Prefix used to indicate the start of an annoation (note the trailing colon) */ -static std::map xmlReconstructAttributeOffsets( - CharReader &reader, size_t offs) -{ - std::map res; - - // Fork the reader, we don't want to mess up the XML parsing process, do we? 
- CharReaderFork readerFork = reader.fork(); - - // Move the read cursor to the start location, abort if this does not work - if (!location.isValid() || offs != readerFork.seek(offs)) { - return res; - } - - // Now all we need to do is to implement one half of an XML parser. As this - // is inherently complicated we'll totaly fail at it. Don't care. All we - // want to get is those darn offsets for pretty error messages... (and we - // can assume the XML is valid as it was already read by expat) - XmlAttributeState state = XmlAttributeState::IN_TAG_NAME; - char c; - std::stringstream attrName; - while (readerFork.read(c)) { - // Abort at the end of the tag - if (c == '>' && state != XmlAttributeState::IN_ATTR_DATA) { - return res; - } +static const std::string ANNOTATION_START_PREFIX{"a:start:"}; - // One state machine to rule them all, one state machine to find them, - // One state machine to bring them all and in the darkness bind them - // (the byte offsets) - switch (state) { - case XmlAttributeState::IN_TAG_NAME: - if (Utils::isWhitespace(c)) { - res.emplace("$tag", - SourceLocation{reader.getSourceId(), offs + 1, - readerFork.getOffset() - 1}); - state = XmlAttributeState::SEARCH_ATTR; - } - break; - case XmlAttributeState::SEARCH_ATTR: - if (!Utils::isWhitespace(c)) { - state = XmlAttributeState::IN_ATTR_NAME; - attrName << c; - } - break; - case XmlAttributeState::IN_ATTR_NAME: - if (Utils::isWhitespace(c)) { - state = XmlAttributeState::HAS_ATTR_NAME; - } else if (c == '=') { - state = XmlAttributeState::HAS_ATTR_EQUALS; - } else { - attrName << c; - } - break; - case XmlAttributeState::HAS_ATTR_NAME: - if (!Utils::isWhitespace(c)) { - if (c == '=') { - state = XmlAttributeState::HAS_ATTR_EQUALS; - break; - } - // Well, this is a strange XML file... We expected to - // see a '=' here! 
Try to continue with the - // "HAS_ATTR_EQUALS" state as this state will hopefully - // inlcude some error recovery - } else { - // Skip whitespace here - break; - } - // Fallthrough - case XmlAttributeState::HAS_ATTR_EQUALS: - if (!Utils::isWhitespace(c)) { - if (c == '"') { - // Here we are! We have found the beginning of an - // attribute. Let's quickly lock the current offset away - // in the result map - res.emplace(attrName.str(), - SourceLocation{reader.getSourceId(), - readerFork.getOffset()}); - state = XmlAttributeState::IN_ATTR_DATA; - } else { - // No, this XML file is not well formed. Assume we're in - // an attribute name once again - attrName.str(std::string{&c, 1}); - state = XmlAttributeState::IN_ATTR_NAME; - } - } - break; - case XmlAttributeState::IN_ATTR_DATA: - if (c == '"') { - // We're at the end of the attribute data, set the end - // location - auto it = res.find(attrName.str()); - if (it != res.end()) { - it->second.setEnd(readerFork.getOffset() - 1); - } - - // Reset the attribute name and restart the search - attrName.str(std::string{}); - state = XmlAttributeState::SEARCH_ATTR; - } - break; - } - } - return res; -} +/** + * Prefix used to indicate the end of an annotation. + */ +static const std::string ANNOTATION_END_PREFIX{"a:end"}; /** * Synchronizes the position of the xml parser with the default location of the @@ -268,22 +196,12 @@ static SourceLocation xmlSyncLoggerPosition(XML_Parser p, size_t len = 0) size_t offs = XML_GetCurrentByteIndex(p); SourceLocation loc = SourceLocation{parser->getReader().getSourceId(), offs, offs + len}; - parser->getLogger().setDefaultLocation(location); + parser->getLogger().setDefaultLocation(loc); // Return the fetched location return loc; } -/** - * Prefix used to indicate the start of an annoation, - */ -static const std::string ANNOTATION_START_PREFIX{"a:start:"}; - -/** - * Prefix used to indicate the end of an annotation. 
- */ -static const std::string ANNOTATION_END_PREFIX{"a:end"}; - /** * Callback called by eXpat whenever a start handler is reached. */ @@ -292,14 +210,21 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name, { // Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser XML_Parser p = static_cast(ref); - OsxmlEventParser *parser = static_cast(XML_GetUserData(p)); + OsxmlEventParser *parser = + static_cast(XML_GetUserData(p)); + + // If there is any text data in the buffer, issue that first + if (parser->getData().hasText()) { + parser->getEvents().data( + parser->getData().getText(parser->getReader().getSourceId())); + } // Read the argument locations -- this is only a stupid and slow hack, // but it is necessary, as expat doesn't give use the byte offset of the // arguments. std::map attributeOffsets = - xmlReconstructXMLAttributeOffsets(*userData->reader, - XML_GetCurrentByteIndex(p)); + OsxmlAttributeLocator::locate(parser->getReader(), + XML_GetCurrentByteIndex(p)); // Update the logger position SourceLocation loc = xmlSyncLoggerPosition(p); @@ -316,7 +241,8 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name, // Make sure we're currently not inside an annotation end tag -- this would // be highly illegal! 
if (parser->getData().inAnnotationEndTag()) { - logger.error("No tags allowed inside an annotation end tag", nameLoc); + parser->getLogger().error( + "No tags allowed inside an annotation end tag", nameLoc); return; } @@ -336,36 +262,33 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name, // Parse the string, pass the location of the key std::pair value = VariantReader::parseGenericString( - *(attr++), stack->getContext().getLogger(), keyLoc.getSourceId(), + *(attr++), parser->getLogger(), keyLoc.getSourceId(), keyLoc.getStart()); // Set the overall location of the parsed element to the attribute // location - value.second->setLocation(keyLoc); - - // Store the - if (!args.emplace(key, value.second).second) { - parser->getLogger().warning( - std::string("Attribute \"") + key + - "\" defined multiple times, only using first definition", - keyLoc); - } + value.second.setLocation(keyLoc); + + // Store the keys in the map + args.emplace(key, value.second).second; } // Fetch the name of the tag, check for special tags std::string nameStr(name); - if (nameStr == "ousia" && parser->getData().depth == 1) { - // We're in the top-level and the magic "ousia" tag is reached -- just + if (nameStr == TOP_LEVEL_TAG && parser->getData().depth == 1) { + // We're in the top-level and the magic tag is reached -- just // ignore it and issue a warning for each argument that has been given for (const auto &arg : args) { - parser->getLogger().warning( - std::string("Ignoring attribute \"") + arg.first + - std::string("\" for magic tag \"ousia\""), - arg.second); + parser->getLogger().warning(std::string("Ignoring attribute \"") + + arg.first + + std::string("\" for magic tag \"") + + TOP_LEVEL_TAG + std::string("\""), + arg.second); } } else if (Utils::startsWith(nameStr, ANNOTATION_START_PREFIX)) { // Assemble a name variant containing the name minus the prefix - Variant nameVar = nameStr.substr(ANNOTATION_START_PREFIX.size()); + Variant nameVar = + 
Variant::fromString(nameStr.substr(ANNOTATION_START_PREFIX.size())); nameVar.setLocation(nameLoc); // Issue the "annotationStart" event @@ -410,25 +333,34 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name, } } -static void xmlEndElementHandler(void *p, const XML_Char *name) +static void xmlEndElementHandler(void *ref, const XML_Char *name) { // Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser XML_Parser p = static_cast(ref); - OsxmlEventParser *parser = static_cast(XML_GetUserData(p)); + OsxmlEventParser *parser = + static_cast(XML_GetUserData(p)); // Synchronize the position of the logger with teh position - xmlSyncLoggerPosition(parser); - - // Decrement the current depth - parser->getData().decrDepth(); + xmlSyncLoggerPosition(p); // Abort as long as we're in an annotation end tag if (parser->getData().inAnnotationEndTag()) { + parser->getData().decrDepth(); return; } + // Decrement the current depth + parser->getData().decrDepth(); + + // If there is any text data in the buffer, issue that first + if (parser->getData().hasText()) { + parser->getEvents().data( + parser->getData().getText(parser->getReader().getSourceId())); + } + // Abort if the special ousia tag ends here - if (nameStr == "ousia" && parser->getData().depth == 0) { + std::string nameStr{name}; + if (nameStr == TOP_LEVEL_TAG && parser->getData().depth == 0) { return; } @@ -436,20 +368,105 @@ static void xmlEndElementHandler(void *p, const XML_Char *name) parser->getEvents().fieldEnd(); } -static void xmlCharacterDataHandler(void *p, const XML_Char *s, int len) +static void xmlCharacterDataHandler(void *ref, const XML_Char *s, int len) { // Fetch the XML_Parser pointer p and a pointer at the OsxmlEventParser XML_Parser p = static_cast(ref); - OsxmlEventParser *parser = static_cast(XML_GetUserData(p)); - - // TODO -/* size_t ulen = len > 0 ? 
static_cast(len) : 0; - syncLoggerPosition(parser, ulen); - const std::string data = Utils::trim(std::string{s, ulen}); - if (!data.empty()) { - stack->data(data); - }*/ + OsxmlEventParser *parser = + static_cast(XML_GetUserData(p)); + + // Abort as long as we're in an annotation end tag + if (parser->getData().inAnnotationEndTag()) { + return; + } + + // Convert the signed (smell the 90's C library here?) length to an usigned + // value + size_t ulen = len > 0 ? static_cast(len) : 0; + + // Synchronize the logger position + SourceLocation loc = xmlSyncLoggerPosition(p, ulen); + + // Fetch some variables for convenience + const WhitespaceMode mode = parser->getWhitespaceMode(); + OsxmlEventParserData &data = parser->getData(); + std::vector &textBuf = data.textBuf; + std::vector &whitespaceBuf = data.whitespaceBuf; + bool &hasWhitespace = data.hasWhitespace; + size_t &textStart = data.textStart; + size_t &textEnd = data.textEnd; + + size_t pos = loc.getStart(); + for (size_t i = 0; i < ulen; i++, pos++) { + switch (mode) { + case WhitespaceMode::PRESERVE: + PreservingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, + textStart, textEnd); + break; + case WhitespaceMode::TRIM: + TrimmingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, + textStart, textEnd, + whitespaceBuf); + break; + case WhitespaceMode::COLLAPSE: + CollapsingWhitespaceHandler::append(s[i], pos, pos + 1, textBuf, + textStart, textEnd, + hasWhitespace); + break; + } + } +} + +/* Class OsxmlEvents */ + +OsxmlEvents::~OsxmlEvents() {} + +/* Class OsxmlEventParser */ + +OsxmlEventParserData::OsxmlEventParserData() + : depth(0), + annotationEndTagDepth(-1), + hasWhitespace(false), + textStart(0), + textEnd(0) +{ +} + +void OsxmlEventParserData::incrDepth() { depth++; } + +void OsxmlEventParserData::decrDepth() +{ + if (depth > 0) { + depth--; + } + if (depth < annotationEndTagDepth) { + annotationEndTagDepth = -1; + } +} + +bool OsxmlEventParserData::inAnnotationEndTag() +{ + return 
(annotationEndTagDepth > 0) && (depth >= annotationEndTagDepth); } + +bool OsxmlEventParserData::hasText() { return !textBuf.empty(); } + +Variant OsxmlEventParserData::getText(SourceId sourceId) +{ + // Create a variant containing the string data and the location + Variant var = + Variant::fromString(std::string{textBuf.data(), textBuf.size()}); + var.setLocation({sourceId, textStart, textEnd}); + + // Reset the text buffers + textBuf.clear(); + whitespaceBuf.clear(); + hasWhitespace = false; + textStart = 0; + textEnd = 0; + + // Return the variant + return var; } /* Class OsxmlEventParser */ @@ -459,21 +476,22 @@ OsxmlEventParser::OsxmlEventParser(CharReader &reader, OsxmlEvents &events, : reader(reader), events(events), logger(logger), - whitespaceMode(WhitespaceMode::COLLAPSE), + whitespaceMode(WhitespaceMode::TRIM), data(new OsxmlEventParserData()) { } -void OsxmlEventParser::parse(CharReader &reader) +OsxmlEventParser::~OsxmlEventParser() {} + +void OsxmlEventParser::parse() { // Create the parser object - ScopedExpatXmlParser p{"UTF-8"}; + GuardedExpatXmlParser p{"UTF-8"}; // Reset the depth - depth = 0; + data->depth = 0; - // Pass the reference to the ParserStack to the XML handler - XMLUserData data(&stack, &reader); + // Pass the reference to this parser instance to the XML handler XML_SetUserData(&p, this); XML_UseParserAsHandlerArg(&p); @@ -498,7 +516,7 @@ void OsxmlEventParser::parse(CharReader &reader) if (!XML_ParseBuffer(&p, bytesRead, bytesRead == 0)) { throw LoggableException{ "XML: " + std::string{XML_ErrorString(XML_GetErrorCode(&p))}, - xmlSyncLoggerPosition(p)}; + xmlSyncLoggerPosition(&p)}; } // Abort once there are no more bytes in the stream @@ -513,12 +531,17 @@ void OsxmlEventParser::setWhitespaceMode(WhitespaceMode whitespaceMode) this->whitespaceMode = whitespaceMode; } -CharReader &OsxmlEventParser::getCharReader() { return charReader; } +WhitespaceMode OsxmlEventParser::getWhitespaceMode() const +{ + return whitespaceMode; +} + 
+CharReader &OsxmlEventParser::getReader() const { return reader; } -Logger &OsxmlEventParser::getLogger() { return logger; } +Logger &OsxmlEventParser::getLogger() const { return logger; } -OsxmlEvents &OsxmlEventParser::getEvents() { return events; } +OsxmlEvents &OsxmlEventParser::getEvents() const { return events; } -OsxmlEventParserData &OsxmlEventParser::getData() { return *data; } +OsxmlEventParserData &OsxmlEventParser::getData() const { return *data; } } diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp index 5319ca6..aa20ea9 100644 --- a/src/formats/osxml/OsxmlEventParser.hpp +++ b/src/formats/osxml/OsxmlEventParser.hpp @@ -42,7 +42,7 @@ class Variant; class OsxmlEventParserData; /** - * Interface which defines the callback functions which are called by the + * Interface which defines the callback functions which are called by the * OsxmlEventParser whenever an event occurs. */ class OsxmlEvents { @@ -50,13 +50,13 @@ public: /** * Virtual destructor. */ - virtual ~OsxmlEvents() {} + virtual ~OsxmlEvents(); /** * Called whenever a command starts. Note that this implicitly always starts * the default field of the command. * - * @param name is a string variant containing name and location of the + * @param name is a string variant containing name and location of the * command. * @param args is a map variant containing the arguments that were given * to the command. @@ -67,12 +67,12 @@ public: * Called whenever an annotation starts. Note that this implicitly always * starts the default field of the annotation. * - * @param name is a string variant containing the name of the annotation + * @param name is a string variant containing the name of the annotation * class and the location of the annotation definition. * @param args is a map variant containing the arguments that were given * to the annotation definition. 
*/ - virtual void annotationStart(Variant name, Variant args); + virtual void annotationStart(Variant name, Variant args) = 0; /** * Called whenever the range of an annotation ends. The callee must @@ -85,12 +85,12 @@ public: * ended here. May be empty (or nullptr), if no elementName has been * specified at the end of the annotation. */ - virtual void annotationEnd(Variant name, Variant elementName); + virtual void annotationEnd(Variant name, Variant elementName) = 0; /** - * Called whenever the default field which was implicitly started by + * Called whenever the default field which was implicitly started by * commandStart or annotationStart ends. Note that this does not end the - * range of an annotation, but the default field of the annotation. To + * range of an annotation, but the default field of the annotation. To * signal the end of the annotation this, the annotationEnd method will be * invoked. */ @@ -102,11 +102,10 @@ public: * is not called if the parsing failed, the parser prints an error message * instead. * - * @param data is the already parsed data that should be passed to the + * @param data is the already parsed data that should be passed to the * handler. */ virtual void data(Variant data) = 0; - }; /** @@ -148,7 +147,7 @@ public: * Constructor fo the OsxmlEventParser. Takes a reference at the OsxmlEvents * of which the callback functions are called. * - * @param reader is a reference to the CharReader instance from which the + * @param reader is a reference to the CharReader instance from which the * XML should be read. * @param events is a refence at an instance of the OsxmlEvents class. All * events are forwarded to this class. @@ -157,6 +156,11 @@ public: */ OsxmlEventParser(CharReader &reader, OsxmlEvents &events, Logger &logger); + /** + * Destructor of OsxmlEventParser (needed for unique_ptr to incomplete type) + */ + ~OsxmlEventParser(); + /** * Performs the actual parsing. 
Reads the XML using eXpat and calles the * callbacks in the event listener instance whenever something interesting @@ -167,38 +171,44 @@ public: /** * Sets the whitespace handling mode. * - * @param whitespaceMode defines how whitespace in the data should be + * @param whitespaceMode defines how whitespace in the data should be * handled. */ void setWhitespaceMode(WhitespaceMode whitespaceMode); + /** + * Returns the current whitespace handling mode. + * + * @return the currently set whitespace handling mode. + */ + WhitespaceMode getWhitespaceMode() const; + /** * Returns the internal CharReader reference. * * @return the CharReader reference. */ - CharReader &getCharReader(); + CharReader &getReader() const; /** * Returns the internal Logger reference. * * @return the internal Logger reference. */ - Logger &getLogger(); + Logger &getLogger() const; /** * Returns the internal OsxmlEvents reference. * * @return the internal OsxmlEvents reference. */ - OsxmlEvents &getEvents(); + OsxmlEvents &getEvents() const; /** * Returns a reference at the internal data. 
*/ - OsxmlEventParserData &getData(); + OsxmlEventParserData &getData() const; }; - } #endif /* _OSXML_EVENT_PARSER_HPP_ */ diff --git a/test/formats/osml/OsmlStreamParserTest.cpp b/test/formats/osml/OsmlStreamParserTest.cpp index e5eff05..b944af8 100644 --- a/test/formats/osml/OsmlStreamParserTest.cpp +++ b/test/formats/osml/OsmlStreamParserTest.cpp @@ -28,6 +28,7 @@ namespace ousia { static TerminalLogger logger(std::cerr, true); +//static ConcreteLogger logger; TEST(OsmlStreamParser, empty) { diff --git a/test/formats/osxml/OsxmlEventParserTest.cpp b/test/formats/osxml/OsxmlEventParserTest.cpp new file mode 100644 index 0000000..06c800f --- /dev/null +++ b/test/formats/osxml/OsxmlEventParserTest.cpp @@ -0,0 +1,222 @@ +/* + Ousía + Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include + +#include +#include +#include + +#include + +namespace ousia { + +static TerminalLogger logger(std::cerr, true); +// static ConcreteLogger logger; + +namespace { +enum class OsxmlEvent { + COMMAND_START, + ANNOTATION_START, + ANNOTATION_END, + FIELD_END, + DATA +}; + +class TestOsxmlEventListener : public OsxmlEvents { +public: + std::vector> events; + + void commandStart(Variant name, Variant args) override + { + events.emplace_back(OsxmlEvent::COMMAND_START, + Variant::arrayType{name, args}); + } + + void annotationStart(Variant name, Variant args) override + { + events.emplace_back(OsxmlEvent::ANNOTATION_START, + Variant::arrayType{name, args}); + } + + void annotationEnd(Variant name, Variant elementName) override + { + events.emplace_back(OsxmlEvent::ANNOTATION_END, + Variant::arrayType{name, elementName}); + } + + void fieldEnd() override + { + events.emplace_back(OsxmlEvent::FIELD_END, Variant::arrayType{}); + } + + void data(Variant data) override + { + events.emplace_back(OsxmlEvent::DATA, Variant::arrayType{data}); + } +}; + +static std::vector> parseXml( + const char *testString, + WhitespaceMode whitespaceMode = WhitespaceMode::TRIM) +{ + TestOsxmlEventListener listener; + CharReader reader(testString); + OsxmlEventParser parser(reader, listener, logger); + parser.setWhitespaceMode(whitespaceMode); + parser.parse(); + return listener.events; +} +} + +TEST(OsxmlEventParser, simpleCommandWithArgs) +{ + const char *testString = ""; + // 01234567 89012 3456 78 9012 34 5678 90123 456 + // 0 1 2 3 + + std::vector> expectedEvents{ + {OsxmlEvent::COMMAND_START, + Variant::arrayType{ + "a", Variant::mapType{ + {"name", "test"}, {"a", 1}, {"b", 2}, {"c", "blub"}}}}, + {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; + + auto events = parseXml(testString); + ASSERT_EQ(expectedEvents, events); + + // Check the locations (I'll do this one time and then just assume it works) + ASSERT_EQ(1U, events[0].second.asArray()[0].getLocation().getStart()); + 
ASSERT_EQ(2U, events[0].second.asArray()[0].getLocation().getEnd()); + ASSERT_EQ( + 9U, + events[0].second.asArray()[1].asMap()["name"].getLocation().getStart()); + ASSERT_EQ( + 13U, + events[0].second.asArray()[1].asMap()["name"].getLocation().getEnd()); + ASSERT_EQ( + 18U, + events[0].second.asArray()[1].asMap()["a"].getLocation().getStart()); + ASSERT_EQ( + 19U, events[0].second.asArray()[1].asMap()["a"].getLocation().getEnd()); + ASSERT_EQ( + 24U, + events[0].second.asArray()[1].asMap()["b"].getLocation().getStart()); + ASSERT_EQ( + 25U, events[0].second.asArray()[1].asMap()["b"].getLocation().getEnd()); + ASSERT_EQ( + 30U, + events[0].second.asArray()[1].asMap()["c"].getLocation().getStart()); + ASSERT_EQ( + 34U, events[0].second.asArray()[1].asMap()["c"].getLocation().getEnd()); +} + +TEST(OsxmlEventParser, magicTopLevelTag) +{ + const char *testString = ""; + + std::vector> expectedEvents{ + {OsxmlEvent::COMMAND_START, + Variant::arrayType{{"a", Variant::mapType{}}}}, + {OsxmlEvent::FIELD_END, Variant::arrayType{}}, + {OsxmlEvent::COMMAND_START, + Variant::arrayType{{"b", Variant::mapType{}}}}, + {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; + + auto events = parseXml(testString); + ASSERT_EQ(expectedEvents, events); +} + +TEST(OsxmlEventParser, magicTopLevelTagInside) +{ + const char *testString = ""; + + std::vector> expectedEvents{ + {OsxmlEvent::COMMAND_START, + Variant::arrayType{{"a", Variant::mapType{}}}}, + {OsxmlEvent::COMMAND_START, + Variant::arrayType{{"ousia", Variant::mapType{}}}}, + {OsxmlEvent::FIELD_END, Variant::arrayType{}}, + {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; + + auto events = parseXml(testString); + ASSERT_EQ(expectedEvents, events); +} + +TEST(OsxmlEventParser, commandWithDataPreserveWhitespace) +{ + const char *testString = " hello \n world "; + // 012345678901 234567890123 + // 0 1 2 + + std::vector> expectedEvents{ + {OsxmlEvent::COMMAND_START, + Variant::arrayType{"a", Variant::mapType{}}}, + {OsxmlEvent::DATA, 
Variant::arrayType{" hello \n world "}}, + {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; + + auto events = parseXml(testString, WhitespaceMode::PRESERVE); + ASSERT_EQ(expectedEvents, events); + + // Check the location of the text + ASSERT_EQ(3U, events[1].second.asArray()[0].getLocation().getStart()); + ASSERT_EQ(20U, events[1].second.asArray()[0].getLocation().getEnd()); +} + +TEST(OsxmlEventParser, commandWithDataTrimWhitespace) +{ + const char *testString = " hello \n world "; + // 012345678901 234567890123 + // 0 1 2 + + std::vector> expectedEvents{ + {OsxmlEvent::COMMAND_START, + Variant::arrayType{"a", Variant::mapType{}}}, + {OsxmlEvent::DATA, Variant::arrayType{"hello \n world"}}, + {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; + + auto events = parseXml(testString, WhitespaceMode::TRIM); + ASSERT_EQ(expectedEvents, events); + + // Check the location of the text + ASSERT_EQ(5U, events[1].second.asArray()[0].getLocation().getStart()); + ASSERT_EQ(19U, events[1].second.asArray()[0].getLocation().getEnd()); +} + +TEST(OsxmlEventParser, commandWithDataCollapseWhitespace) +{ + const char *testString = " hello \n world "; + // 012345678901 234567890123 + // 0 1 2 + + std::vector> expectedEvents{ + {OsxmlEvent::COMMAND_START, + Variant::arrayType{"a", Variant::mapType{}}}, + {OsxmlEvent::DATA, Variant::arrayType{"hello world"}}, + {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; + + auto events = parseXml(testString, WhitespaceMode::COLLAPSE); + ASSERT_EQ(expectedEvents, events); + + // Check the location of the text + ASSERT_EQ(5U, events[1].second.asArray()[0].getLocation().getStart()); + ASSERT_EQ(19U, events[1].second.asArray()[0].getLocation().getEnd()); +} + +} + -- cgit v1.2.3 From 9b4cdfabf6527440d6ffa499cc6b57a44daaeadb Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 00:05:42 +0100 Subject: Added code for the handling of explicit default fields and improved unit tests --- CMakeLists.txt | 16 +- 
src/formats/osml/OsmlStreamParser.cpp | 78 +++++-- src/formats/osml/OsmlStreamParser.hpp | 45 +++- test/formats/osml/OsmlStreamParserTest.cpp | 340 +++++++++++++++++------------ 4 files changed, 302 insertions(+), 177 deletions(-) (limited to 'src/formats') diff --git a/CMakeLists.txt b/CMakeLists.txt index bdc9541..d311f7a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -290,15 +290,15 @@ IF(TEST) ousia_core ) -# ADD_EXECUTABLE(ousia_test_filesystem -# test/plugins/filesystem/FileLocatorTest -# ) + ADD_EXECUTABLE(ousia_test_filesystem + test/plugins/filesystem/FileLocatorTest + ) -# TARGET_LINK_LIBRARIES(ousia_test_filesystem -# ${GTEST_LIBRARIES} -# ousia_core -# ousia_filesystem -# ) + TARGET_LINK_LIBRARIES(ousia_test_filesystem + ${GTEST_LIBRARIES} + ousia_core + ousia_filesystem + ) # ADD_EXECUTABLE(ousia_test_css # test/plugins/css/Tokenizer diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp index 6b00eef..6606120 100644 --- a/src/formats/osml/OsmlStreamParser.cpp +++ b/src/formats/osml/OsmlStreamParser.cpp @@ -60,6 +60,11 @@ public: */ TokenTypeId FieldEnd; + /** + * Id of the default field start token. + */ + TokenTypeId DefaultFieldStart; + /** * Registers the plain format tokens in the internal tokenizer. 
*/ @@ -71,6 +76,7 @@ public: BlockCommentEnd = registerToken("}%"); FieldStart = registerToken("{"); FieldEnd = registerToken("}"); + DefaultFieldStart = registerToken("{!"); } }; @@ -164,7 +170,7 @@ OsmlStreamParser::OsmlStreamParser(CharReader &reader, Logger &logger) : reader(reader), logger(logger), tokenizer(Tokens) { // Place an intial command representing the complete file on the stack - commands.push(Command{"", Variant::mapType{}, true, true, true}); + commands.push(Command{"", Variant::mapType{}, true, true, true, false}); } Variant OsmlStreamParser::parseIdentifier(size_t start, bool allowNSSep) @@ -365,7 +371,7 @@ void OsmlStreamParser::pushCommand(Variant commandName, commands.pop(); } commands.push(Command{std::move(commandName), std::move(commandArguments), - hasRange, false, false}); + hasRange, false, false, false}); } OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start) @@ -482,6 +488,29 @@ bool OsmlStreamParser::checkIssueFieldStart() return false; } +bool OsmlStreamParser::closeField() +{ + // Try to end an open field of the current command -- if the current command + // is not inside an open field, end this command and try to close the next + // one + for (int i = 0; i < 2 && commands.size() > 1; i++) { + Command &cmd = commands.top(); + if (!cmd.inRangeField) { + if (cmd.inField) { + cmd.inField = false; + if (cmd.inDefaultField) { + commands.pop(); + } + return true; + } + commands.pop(); + } else { + return false; + } + } + return false; +} + OsmlStreamParser::State OsmlStreamParser::parse() { // Handler for incomming data @@ -579,27 +608,29 @@ OsmlStreamParser::State OsmlStreamParser::parse() } logger.error( "Got field start token \"{\", but no command for which to " - "start the field. Did you mean \"\\{\"?", + "start the field. 
Write \"\\{\" to insert this sequence as " + "text.", token); } else if (token.type == Tokens.FieldEnd) { - // Try to end an open field of the current command -- if the current - // command is not inside an open field, end this command and try to - // close the next one - for (int i = 0; i < 2 && commands.size() > 1; i++) { - Command &cmd = commands.top(); - if (!cmd.inRangeField) { - if (cmd.inField) { - cmd.inField = false; - return State::FIELD_END; - } - commands.pop(); - } else { - break; - } + if (closeField()) { + return State::FIELD_END; + } + logger.error( + "Got field end token \"}\", but there is no field to end. " + "Write \"\\}\" to insert this sequence as text.", + token); + } else if (token.type == Tokens.DefaultFieldStart) { + // Try to start a default field the first time the token is reached + Command &topCmd = commands.top(); + if (!topCmd.inField) { + topCmd.inField = true; + topCmd.inDefaultField = true; + return State::FIELD_START; } logger.error( - "Got field end token \"}\", but there is no field to end. Did " - "you mean \"\\}\"?", + "Got default field start token \"{!\", but no command for " + "which to start the field. 
Write \"\\{!\" to insert this " + "sequence as text", token); } else { logger.error("Unexpected token \"" + token.content + "\"", token); @@ -627,14 +658,19 @@ OsmlStreamParser::State OsmlStreamParser::parse() return State::END; } -const Variant &OsmlStreamParser::getCommandName() +const Variant &OsmlStreamParser::getCommandName() const { return commands.top().name; } -const Variant &OsmlStreamParser::getCommandArguments() +const Variant &OsmlStreamParser::getCommandArguments() const { return commands.top().arguments; } + +bool OsmlStreamParser::inDefaultField() const +{ + return commands.top().inRangeField || commands.top().inDefaultField; +} } diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp index 1508012..bb5db65 100644 --- a/src/formats/osml/OsmlStreamParser.hpp +++ b/src/formats/osml/OsmlStreamParser.hpp @@ -152,10 +152,16 @@ public: */ bool inRangeField; + /** + * Set to true if we are currently in a field that has been especially + * marked as default field (using the "|") syntax. + */ + bool inDefaultField; + /** * Default constructor. */ - Command() : hasRange(false), inField(false), inRangeField(false) {} + Command() : hasRange(false), inField(false), inRangeField(false), inDefaultField() {} /** * Constructor of the Command class. @@ -168,16 +174,19 @@ public: * explicit range. * @param inField is set to true if we currently are inside a field * of this command. - * @param inRangeField is set to true if we currently inside the outer - * field of the command. + * @param inRangeField is set to true if we currently are inside the + * outer field of a ranged command. + * @param inDefaultField is set to true if we currently are in a + * specially marked default field. 
*/ Command(Variant name, Variant arguments, bool hasRange, bool inField, - bool inRangeField) + bool inRangeField, bool inDefaultField) : name(std::move(name)), arguments(std::move(arguments)), hasRange(hasRange), inField(inField), - inRangeField(inRangeField) + inRangeField(inRangeField), + inDefaultField(inDefaultField) { } }; @@ -289,6 +298,16 @@ private: */ bool checkIssueFieldStart(); + /** + * Closes a currently open field. Note that the command will be removed from + * the internal command stack if the field that is being closed is a + * field marked as default field. + * + * @return true if the field could be closed, false if there was no field + * to close. + */ + bool closeField(); + public: /** * Constructor of the OsmlStreamParser class. Attaches the new @@ -317,7 +336,7 @@ public: * @return a reference at a variant containing the data parsed by the * "parse" function. */ - const Variant &getData() { return data; } + const Variant &getData() const { return data; } /** * Returns a reference at the internally stored command name. Only valid if @@ -326,7 +345,7 @@ public: * @return a reference at a variant containing name and location of the * parsed command. */ - const Variant &getCommandName(); + const Variant &getCommandName() const; /** * Returns a reference at the internally stored command name. Only valid if @@ -335,14 +354,22 @@ public: * @return a reference at a variant containing arguments given to the * command. */ - const Variant &getCommandArguments(); + const Variant &getCommandArguments() const; + + /** + * Returns true if the current field is the "default" field. This is true if + * the parser either is in the outer range of a range command or inside a + * field that has been especially marked as "default" field (using the "|" + * syntax). + */ + bool inDefaultField() const; /** * Returns a reference at the char reader. * * @return the last internal token location. 
*/ - SourceLocation &getLocation() { return location; } + const SourceLocation &getLocation() const { return location; } }; } diff --git a/test/formats/osml/OsmlStreamParserTest.cpp b/test/formats/osml/OsmlStreamParserTest.cpp index b944af8..da9fe8a 100644 --- a/test/formats/osml/OsmlStreamParserTest.cpp +++ b/test/formats/osml/OsmlStreamParserTest.cpp @@ -28,7 +28,88 @@ namespace ousia { static TerminalLogger logger(std::cerr, true); -//static ConcreteLogger logger; +// static ConcreteLogger logger; + +static void assertCommand(OsmlStreamParser &reader, const std::string &name, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) +{ + ASSERT_EQ(OsmlStreamParser::State::COMMAND, reader.parse()); + EXPECT_EQ(name, reader.getCommandName().asString()); + if (start != InvalidSourceOffset) { + EXPECT_EQ(start, reader.getCommandName().getLocation().getStart()); + EXPECT_EQ(start, reader.getLocation().getStart()); + } + if (end != InvalidSourceOffset) { + EXPECT_EQ(end, reader.getCommandName().getLocation().getEnd()); + EXPECT_EQ(end, reader.getLocation().getEnd()); + } +} + +static void assertCommand(OsmlStreamParser &reader, const std::string &name, + const Variant::mapType &args, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) +{ + assertCommand(reader, name, start, end); + EXPECT_EQ(args, reader.getCommandArguments()); +} + +static void assertData(OsmlStreamParser &reader, const std::string &data, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) +{ + ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); + EXPECT_EQ(data, reader.getData().asString()); + if (start != InvalidSourceOffset) { + EXPECT_EQ(start, reader.getData().getLocation().getStart()); + EXPECT_EQ(start, reader.getLocation().getStart()); + } + if (end != InvalidSourceOffset) { + EXPECT_EQ(end, reader.getData().getLocation().getEnd()); + EXPECT_EQ(end, reader.getLocation().getEnd()); + } +} + 
+static void assertFieldStart(OsmlStreamParser &reader, bool defaultField, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) +{ + ASSERT_EQ(OsmlStreamParser::State::FIELD_START, reader.parse()); + EXPECT_EQ(defaultField, reader.inDefaultField()); + if (start != InvalidSourceOffset) { + EXPECT_EQ(start, reader.getLocation().getStart()); + } + if (end != InvalidSourceOffset) { + EXPECT_EQ(end, reader.getLocation().getEnd()); + } +} + +static void assertFieldEnd(OsmlStreamParser &reader, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) +{ + ASSERT_EQ(OsmlStreamParser::State::FIELD_END, reader.parse()); + if (start != InvalidSourceOffset) { + EXPECT_EQ(start, reader.getLocation().getStart()); + } + if (end != InvalidSourceOffset) { + EXPECT_EQ(end, reader.getLocation().getEnd()); + } +} + +static void assertEnd(OsmlStreamParser &reader, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) +{ + ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); + if (start != InvalidSourceOffset) { + EXPECT_EQ(start, reader.getLocation().getStart()); + } + if (end != InvalidSourceOffset) { + EXPECT_EQ(end, reader.getLocation().getEnd()); + } +} TEST(OsmlStreamParser, empty) { @@ -47,12 +128,7 @@ TEST(OsmlStreamParser, oneCharacter) OsmlStreamParser reader(charReader, logger); - ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("a", reader.getData().asString()); - - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(0U, loc.getStart()); - ASSERT_EQ(1U, loc.getEnd()); + assertData(reader, "a", 0, 1); } TEST(OsmlStreamParser, whitespaceElimination) @@ -64,12 +140,7 @@ TEST(OsmlStreamParser, whitespaceElimination) OsmlStreamParser reader(charReader, logger); - ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("hello world", reader.getData().asString()); - - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(1U, 
loc.getStart()); - ASSERT_EQ(14U, loc.getEnd()); + assertData(reader, "hello world", 1, 14); } TEST(OsmlStreamParser, whitespaceEliminationWithLinebreak) @@ -81,13 +152,7 @@ TEST(OsmlStreamParser, whitespaceEliminationWithLinebreak) OsmlStreamParser reader(charReader, logger); - ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("hello world", reader.getData().asString()); - - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(14U, loc.getEnd()); - ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); + assertData(reader, "hello world", 1, 14); } TEST(OsmlStreamParser, escapeWhitespace) @@ -99,13 +164,7 @@ TEST(OsmlStreamParser, escapeWhitespace) OsmlStreamParser reader(charReader, logger); - ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - ASSERT_EQ("hello world", reader.getData().asString()); - - SourceLocation loc = reader.getData().getLocation(); - ASSERT_EQ(1U, loc.getStart()); - ASSERT_EQ(15U, loc.getEnd()); - ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); + assertData(reader, "hello world", 1, 15); } static void testEscapeSpecialCharacter(const std::string &c) @@ -127,6 +186,7 @@ TEST(OsmlStreamParser, escapeSpecialCharacters) testEscapeSpecialCharacter("}"); testEscapeSpecialCharacter("<"); testEscapeSpecialCharacter(">"); + testEscapeSpecialCharacter("|"); } TEST(OsmlStreamParser, simpleSingleLineComment) @@ -347,86 +407,6 @@ TEST(OsmlStreamParser, simpleCommandWithArgumentsAndName) ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); } -static void assertCommand(OsmlStreamParser &reader, const std::string &name, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) -{ - ASSERT_EQ(OsmlStreamParser::State::COMMAND, reader.parse()); - EXPECT_EQ(name, reader.getCommandName().asString()); - if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getCommandName().getLocation().getStart()); - EXPECT_EQ(start, 
reader.getLocation().getStart()); - } - if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getCommandName().getLocation().getEnd()); - EXPECT_EQ(end, reader.getLocation().getEnd()); - } -} - -static void assertCommand(OsmlStreamParser &reader, const std::string &name, - const Variant::mapType &args, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) -{ - assertCommand(reader, name, start, end); - EXPECT_EQ(args, reader.getCommandArguments()); -} - -static void assertData(OsmlStreamParser &reader, const std::string &data, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) -{ - ASSERT_EQ(OsmlStreamParser::State::DATA, reader.parse()); - EXPECT_EQ(data, reader.getData().asString()); - if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getData().getLocation().getStart()); - EXPECT_EQ(start, reader.getLocation().getStart()); - } - if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getData().getLocation().getEnd()); - EXPECT_EQ(end, reader.getLocation().getEnd()); - } -} - -static void assertFieldStart(OsmlStreamParser &reader, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) -{ - ASSERT_EQ(OsmlStreamParser::State::FIELD_START, reader.parse()); - if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getLocation().getStart()); - } - if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getLocation().getEnd()); - } -} - -static void assertFieldEnd(OsmlStreamParser &reader, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = InvalidSourceOffset) -{ - ASSERT_EQ(OsmlStreamParser::State::FIELD_END, reader.parse()); - if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getLocation().getStart()); - } - if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getLocation().getEnd()); - } -} - -static void assertEnd(OsmlStreamParser &reader, - SourceOffset start = InvalidSourceOffset, - SourceOffset end = 
InvalidSourceOffset) -{ - ASSERT_EQ(OsmlStreamParser::State::END, reader.parse()); - if (start != InvalidSourceOffset) { - EXPECT_EQ(start, reader.getLocation().getStart()); - } - if (end != InvalidSourceOffset) { - EXPECT_EQ(end, reader.getLocation().getEnd()); - } -} - TEST(OsmlStreamParser, fields) { const char *testString = "\\test{a}{b}{c}"; @@ -436,15 +416,15 @@ TEST(OsmlStreamParser, fields) OsmlStreamParser reader(charReader, logger); assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, 5, 6); + assertFieldStart(reader, false, 5, 6); assertData(reader, "a", 6, 7); assertFieldEnd(reader, 7, 8); - assertFieldStart(reader, 8, 9); + assertFieldStart(reader, false, 8, 9); assertData(reader, "b", 9, 10); assertFieldEnd(reader, 10, 11); - assertFieldStart(reader, 11, 12); + assertFieldStart(reader, false, 11, 12); assertData(reader, "c", 12, 13); assertFieldEnd(reader, 13, 14); assertEnd(reader, 14, 14); @@ -459,11 +439,11 @@ TEST(OsmlStreamParser, dataOutsideField) OsmlStreamParser reader(charReader, logger); assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, 5, 6); + assertFieldStart(reader, false, 5, 6); assertData(reader, "a", 6, 7); assertFieldEnd(reader, 7, 8); - assertFieldStart(reader, 8, 9); + assertFieldStart(reader, false, 8, 9); assertData(reader, "b", 9, 10); assertFieldEnd(reader, 10, 11); @@ -481,14 +461,14 @@ TEST(OsmlStreamParser, nestedCommand) assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, 5, 6); + assertFieldStart(reader, false, 5, 6); assertData(reader, "a", 6, 7); assertFieldEnd(reader, 7, 8); - assertFieldStart(reader, 8, 9); + assertFieldStart(reader, false, 8, 9); { assertCommand(reader, "test2", 9, 15); - assertFieldStart(reader, 15, 16); + assertFieldStart(reader, false, 15, 16); assertData(reader, "b", 16, 17); assertFieldEnd(reader, 17, 18); } @@ -507,10 +487,10 @@ TEST(OsmlStreamParser, nestedCommandImmediateEnd) OsmlStreamParser reader(charReader, logger); assertCommand(reader, "test", 0, 5); 
- assertFieldStart(reader, 5, 6); + assertFieldStart(reader, false, 5, 6); { assertCommand(reader, "test2", 6, 12); - assertFieldStart(reader, 12, 13); + assertFieldStart(reader, false, 12, 13); assertData(reader, "b", 13, 14); assertFieldEnd(reader, 14, 15); } @@ -527,7 +507,7 @@ TEST(OsmlStreamParser, nestedCommandNoData) OsmlStreamParser reader(charReader, logger); assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, 5, 6); + assertFieldStart(reader, false, 5, 6); assertCommand(reader, "test2", 6, 12); assertFieldEnd(reader, 12, 13); assertEnd(reader, 13, 13); @@ -557,11 +537,11 @@ TEST(OsmlStreamParser, fieldsWithSpaces) OsmlStreamParser reader(charReader, logger); assertCommand(reader, "a", 0, 2); - assertFieldStart(reader, 3, 4); + assertFieldStart(reader, false, 3, 4); assertCommand(reader, "b", 4, 6); assertCommand(reader, "c", 7, 9); assertFieldEnd(reader, 9, 10); - assertFieldStart(reader, 16, 17); + assertFieldStart(reader, false, 16, 17); assertCommand(reader, "d", 17, 19); assertFieldEnd(reader, 19, 20); assertEnd(reader, 20, 20); @@ -612,9 +592,9 @@ TEST(OsmlStreamParser, errorNoFieldEndNested) logger.reset(); assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, 5, 6); + assertFieldStart(reader, false, 5, 6); assertCommand(reader, "test2", 6, 12); - assertFieldStart(reader, 12, 13); + assertFieldStart(reader, false, 12, 13); assertFieldEnd(reader, 13, 14); assertFieldEnd(reader, 14, 15); ASSERT_FALSE(logger.hasError()); @@ -633,9 +613,9 @@ TEST(OsmlStreamParser, errorNoFieldEndNestedData) logger.reset(); assertCommand(reader, "test", 0, 5); - assertFieldStart(reader, 5, 6); + assertFieldStart(reader, false, 5, 6); assertCommand(reader, "test2", 6, 12); - assertFieldStart(reader, 12, 13); + assertFieldStart(reader, false, 12, 13); assertFieldEnd(reader, 13, 14); assertFieldEnd(reader, 14, 15); assertData(reader, "a", 15, 16); @@ -654,7 +634,7 @@ TEST(OsmlStreamParser, beginEnd) OsmlStreamParser reader(charReader, logger); 
assertCommand(reader, "book", 7, 11); - assertFieldStart(reader, 12, 13); + assertFieldStart(reader, true, 12, 13); assertFieldEnd(reader, 17, 21); assertEnd(reader, 22, 22); } @@ -669,7 +649,7 @@ TEST(OsmlStreamParser, beginEndWithName) OsmlStreamParser reader(charReader, logger); assertCommand(reader, "book", {{"name", "a"}}, 7, 11); - assertFieldStart(reader, 14, 15); + assertFieldStart(reader, true, 14, 15); assertFieldEnd(reader, 19, 23); assertEnd(reader, 24, 24); } @@ -685,7 +665,7 @@ TEST(OsmlStreamParser, beginEndWithNameAndArgs) assertCommand(reader, "book", {{"name", "a"}, {"a", 1}, {"b", 2}, {"c", "test"}}, 7, 11); - assertFieldStart(reader, 32, 33); + assertFieldStart(reader, true, 32, 33); assertFieldEnd(reader, 37, 41); assertEnd(reader, 42, 42); } @@ -702,17 +682,17 @@ TEST(OsmlStreamParser, beginEndWithNameAndArgsMultipleFields) assertCommand(reader, "book", {{"name", "a"}, {"a", 1}, {"b", 2}, {"c", "test"}}, 7, 11); - assertFieldStart(reader, 32, 33); + assertFieldStart(reader, false, 32, 33); assertData(reader, "a", 33, 34); assertCommand(reader, "test", Variant::mapType{}, 35, 40); assertFieldEnd(reader, 40, 41); - assertFieldStart(reader, 41, 42); + assertFieldStart(reader, false, 41, 42); assertData(reader, "b", 42, 43); assertCommand(reader, "test", Variant::mapType{}, 44, 49); - assertFieldStart(reader, 49, 50); + assertFieldStart(reader, false, 49, 50); assertFieldEnd(reader, 50, 51); assertFieldEnd(reader, 51, 52); - assertFieldStart(reader, 52, 53); + assertFieldStart(reader, true, 52, 53); assertFieldEnd(reader, 57, 61); assertEnd(reader, 62, 62); } @@ -727,12 +707,45 @@ TEST(OsmlStreamParser, beginEndWithData) OsmlStreamParser reader(charReader, logger); assertCommand(reader, "book", 7, 11); - assertFieldStart(reader, 12, 13); + assertFieldStart(reader, true, 12, 13); assertData(reader, "a", 12, 13); assertFieldEnd(reader, 18, 22); assertEnd(reader, 23, 23); } +TEST(OsmlStreamParser, beginEndNested) +{ + const char *testString = + 
"\\begin{a}{b} c \\begin{d}{e}{f} \\g{h} \\end{d}\\end{a}"; + // 012345678901234 5678901234567890 123456 7890123 4567890 + // 0 1 2 3 4 5 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertCommand(reader, "a", 7, 8); + assertFieldStart(reader, false, 9, 10); + assertData(reader, "b", 10, 11); + assertFieldEnd(reader, 11, 12); + assertFieldStart(reader, true, 13, 14); + assertData(reader, "c", 13, 14); + assertCommand(reader, "d", 22, 23); + assertFieldStart(reader, false, 24, 25); + assertData(reader, "e", 25, 26); + assertFieldEnd(reader, 26, 27); + assertFieldStart(reader, false, 27, 28); + assertData(reader, "f", 28, 29); + assertFieldEnd(reader, 29, 30); + assertFieldStart(reader, true, 31, 32); + assertCommand(reader, "g", 31, 33); + assertFieldStart(reader, false, 33, 34); + assertData(reader, "h", 34, 35); + assertFieldEnd(reader, 35, 36); + assertFieldEnd(reader, 42, 43); + assertFieldEnd(reader, 49, 50); + assertEnd(reader, 51, 51); +} + TEST(OsmlStreamParser, beginEndWithCommand) { const char *testString = "\\begin{book}\\a{test}\\end{book}"; @@ -743,9 +756,9 @@ TEST(OsmlStreamParser, beginEndWithCommand) OsmlStreamParser reader(charReader, logger); assertCommand(reader, "book", 7, 11); - assertFieldStart(reader, 12, 13); + assertFieldStart(reader, true, 12, 13); assertCommand(reader, "a", 12, 14); - assertFieldStart(reader, 14, 15); + assertFieldStart(reader, false, 14, 15); assertData(reader, "test", 15, 19); assertFieldEnd(reader, 19, 20); assertFieldEnd(reader, 25, 29); @@ -873,9 +886,9 @@ TEST(OsmlStreamParser, errorBeginEndMismatch) logger.reset(); assertCommand(reader, "a", 7, 8); - assertFieldStart(reader, 10, 11); + assertFieldStart(reader, true, 10, 11); assertCommand(reader, "b", 17, 18); - assertFieldStart(reader, 20, 24); + assertFieldStart(reader, true, 20, 24); assertData(reader, "test", 20, 24); ASSERT_FALSE(logger.hasError()); ASSERT_THROW(reader.parse(), LoggableException); @@ -904,7 +917,7 @@ 
TEST(OsmlStreamParser, beginEndWithNSSep) OsmlStreamParser reader(charReader, logger); assertCommand(reader, "test1:test2", 7, 18); - assertFieldStart(reader, 19, 20); + assertFieldStart(reader, true, 19, 20); assertFieldEnd(reader, 24, 35); assertEnd(reader, 36, 36); } @@ -920,7 +933,7 @@ TEST(OsmlStreamParser, errorBeginNSSep) ASSERT_FALSE(logger.hasError()); assertCommand(reader, "blub"); ASSERT_TRUE(logger.hasError()); - assertFieldStart(reader); + assertFieldStart(reader, true); assertFieldEnd(reader); assertEnd(reader); } @@ -934,7 +947,7 @@ TEST(OsmlStreamParser, errorEndNSSep) logger.reset(); assertCommand(reader, "blub"); - assertFieldStart(reader); + assertFieldStart(reader, true); ASSERT_FALSE(logger.hasError()); assertFieldEnd(reader); ASSERT_TRUE(logger.hasError()); @@ -970,5 +983,54 @@ TEST(OsmlStreamParser, errorRepeatedNs) assertData(reader, "::"); assertEnd(reader); } + +TEST(OsmlStreamParser, explicitDefaultField) +{ + const char *testString = "\\a{!b}c"; + // 01234567 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertCommand(reader, "a", 0, 2); + assertFieldStart(reader, true, 2, 4); + assertData(reader, "b", 4, 5); + assertFieldEnd(reader, 5, 6); + assertData(reader, "c", 6, 7); + assertEnd(reader, 7, 7); +} + +TEST(OsmlStreamParser, explicitDefaultFieldWithCommand) +{ + const char *testString = "\\a{!\\b}c"; + // 0123 4567 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertCommand(reader, "a", 0, 2); + assertFieldStart(reader, true, 2, 4); + assertCommand(reader, "b", 4, 6); + assertFieldEnd(reader, 6, 7); + assertData(reader, "c", 7, 8); + assertEnd(reader, 8, 8); +} + +TEST(OsmlStreamParser, errorFieldAfterExplicitDefaultField) +{ + const char *testString = "\\a{!\\b}{c}"; + // 0123 4567 + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertCommand(reader, "a", 0, 2); + assertFieldStart(reader, true, 2, 4); + 
assertCommand(reader, "b", 4, 6); + assertFieldEnd(reader, 6, 7); + assertData(reader, "c", 7, 8); + assertEnd(reader, 8, 8); +} + } -- cgit v1.2.3 From 205810b44c980998958dcd857c2cb34a914dc760 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Thu, 12 Feb 2015 16:21:36 +0100 Subject: Implemented annotation start and end field --- contrib/test.osdm | 29 ---- contrib/test.osml | 29 ++++ src/formats/osml/OsmlStreamParser.cpp | 116 ++++++++++++--- src/formats/osml/OsmlStreamParser.hpp | 16 +- test/formats/osml/OsmlStreamParserTest.cpp | 228 ++++++++++++++++++++++++++++- 5 files changed, 363 insertions(+), 55 deletions(-) delete mode 100644 contrib/test.osdm create mode 100644 contrib/test.osml (limited to 'src/formats') diff --git a/contrib/test.osdm b/contrib/test.osdm deleted file mode 100644 index 100bc77..0000000 --- a/contrib/test.osdm +++ /dev/null @@ -1,29 +0,0 @@ -%{ - We're currently inside a block comment. - %{ - Note that block comments can be nested, easily allowing you to comment - out blocks which already contain comments. - }% -}% - -% Well, line comments, as we know them from TeX also work - -\import{meta} -\import{book} - -\domain#special_words{ - \struct#latex - \struct#ousia -} - -\book{ - \include{chapters/chapter1} - \include{chapters/chapter2} - - \begin{note}{Behaviour of "Include"} - Analogous to the `include` command in \latex, \ousia forces the included - file to be *complete* in a sense, that it must not have dangling open - commands. - \end{note} -} - diff --git a/contrib/test.osml b/contrib/test.osml new file mode 100644 index 0000000..100bc77 --- /dev/null +++ b/contrib/test.osml @@ -0,0 +1,29 @@ +%{ + We're currently inside a block comment. + %{ + Note that block comments can be nested, easily allowing you to comment + out blocks which already contain comments. 
+ }% +}% + +% Well, line comments, as we know them from TeX also work + +\import{meta} +\import{book} + +\domain#special_words{ + \struct#latex + \struct#ousia +} + +\book{ + \include{chapters/chapter1} + \include{chapters/chapter2} + + \begin{note}{Behaviour of "Include"} + Analogous to the `include` command in \latex, \ousia forces the included + file to be *complete* in a sense, that it must not have dangling open + commands. + \end{note} +} + diff --git a/src/formats/osml/OsmlStreamParser.cpp b/src/formats/osml/OsmlStreamParser.cpp index 6606120..0174fa4 100644 --- a/src/formats/osml/OsmlStreamParser.cpp +++ b/src/formats/osml/OsmlStreamParser.cpp @@ -65,6 +65,16 @@ public: */ TokenTypeId DefaultFieldStart; + /** + * Id of the annotation start token. + */ + TokenTypeId AnnotationStart; + + /** + * Id of the annotation end token. + */ + TokenTypeId AnnotationEnd; + /** * Registers the plain format tokens in the internal tokenizer. */ @@ -77,6 +87,8 @@ public: FieldStart = registerToken("{"); FieldEnd = registerToken("}"); DefaultFieldStart = registerToken("{!"); + AnnotationStart = registerToken("<\\"); + AnnotationEnd = registerToken("\\>"); } }; @@ -374,7 +386,8 @@ void OsmlStreamParser::pushCommand(Variant commandName, hasRange, false, false, false}); } -OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start) +OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start, + bool isAnnotation) { // Parse the commandName as a first identifier Variant commandName = parseIdentifier(start, true); @@ -388,6 +401,9 @@ OsmlStreamParser::State OsmlStreamParser::parseCommand(size_t start) Utils::split(commandName.asString(), ':'); const bool isBegin = commandNameComponents[0] == "begin"; const bool isEnd = commandNameComponents[0] == "end"; + + // Parse the begin or end command + State res = State::COMMAND; if (isBegin || isEnd) { if (commandNameComponents.size() > 1) { logger.error( @@ -396,30 +412,76 @@ OsmlStreamParser::State 
OsmlStreamParser::parseCommand(size_t start) commandName); } if (isBegin) { - return parseBeginCommand(); + res = parseBeginCommand(); } else if (isEnd) { - return parseEndCommand(); + res = parseEndCommand(); + } + } else { + // Check whether the next character is a '#', indicating the start of + // the command name + Variant commandArgName; + start = reader.getOffset(); + if (reader.expect('#')) { + commandArgName = parseIdentifier(start); + if (commandArgName.asString().empty()) { + logger.error("Expected identifier after \"#\"", commandArgName); + } } + + // Parse the arugments + Variant commandArguments = + parseCommandArguments(std::move(commandArgName)); + + // Push the command onto the command stack + pushCommand(std::move(commandName), std::move(commandArguments), false); } - // Check whether the next character is a '#', indicating the start of the - // command name - Variant commandArgName; - start = reader.getOffset(); - if (reader.expect('#')) { - commandArgName = parseIdentifier(start); - if (commandArgName.asString().empty()) { - logger.error("Expected identifier after \"#\"", commandArgName); + // Check whether a ">" character is the next character that is to be read. + // In that case the current command could be an annotation end command! + char c; + if (reader.fetch(c) && c == '>') { + // Ignore the character after a begin or end command + if (isBegin || isEnd) { + logger.warning( + "Ignoring annotation end character \">\" after special " + "commands \"begin\" or \"end\". Write \"\\>\" to end a " + "\"begin\"/\"end\" enclosed annotation.", + reader); + return res; } - } - // Parse the arugments - Variant commandArguments = parseCommandArguments(std::move(commandArgName)); + // If this should be an annoation, ignore the character + if (isAnnotation) { + logger.warning( + "Ignoring annotation end character \">\" after annotation " + "start command. 
Write \"\\>\" to end the annotation.", + reader); + } else { + // Make sure no arguments apart from the "name" argument are given + // to an annotation end + Variant::mapType &map = commands.top().arguments.asMap(); + if (!map.empty()) { + if (map.count("name") == 0 || map.size() > 1U) { + logger.error( + "An annotation end command may not have any arguments " + "other than \"name\""); + return res; + } + } - // Push the command onto the command stack - pushCommand(std::move(commandName), std::move(commandArguments), false); + // If we got here, this is a valid ANNOTATION_END command, issue it + reader.peek(c); + reader.consumePeek(); + return State::ANNOTATION_END; + } + } - return State::COMMAND; + // If we're starting an annotation, return the command as annotation start + // instead of command + if (isAnnotation && res == State::COMMAND) { + return State::ANNOTATION_START; + } + return res; } void OsmlStreamParser::parseBlockComment() @@ -522,7 +584,7 @@ OsmlStreamParser::State OsmlStreamParser::parse() const TokenTypeId type = token.type; // Special handling for Backslash and Text - if (type == Tokens.Backslash) { + if (type == Tokens.Backslash || type == Tokens.AnnotationStart) { // Before appending anything to the output data or starting a new // command, check whether FIELD_START has to be issued, as the // current command is a command with range @@ -548,7 +610,8 @@ OsmlStreamParser::State OsmlStreamParser::parse() } // Parse the actual command - State res = parseCommand(token.location.getStart()); + State res = parseCommand(token.location.getStart(), + type == Tokens.AnnotationStart); switch (res) { case State::ERROR: throw LoggableException( @@ -565,6 +628,14 @@ OsmlStreamParser::State OsmlStreamParser::parse() // to the data buffer, use the escape character start as start // location and the peek offset as end location reader.peek(c); // Peek the previously fetched character + + // If this was an annotation start token, add the parsed < to the + // 
output + if (type == Tokens.AnnotationStart) { + handler.append('<', token.location.getStart(), + token.location.getStart() + 1); + } + handler.append(c, token.location.getStart(), reader.getPeekOffset()); reader.consumePeek(); @@ -632,6 +703,13 @@ OsmlStreamParser::State OsmlStreamParser::parse() "which to start the field. Write \"\\{!\" to insert this " "sequence as text", token); + } else if (token.type == Tokens.AnnotationEnd) { + // We got a single annotation end token "\>" -- simply issue the + // ANNOTATION_END event + Variant annotationName = Variant::fromString(""); + annotationName.setLocation(token.location); + pushCommand(annotationName, Variant::mapType{}, false); + return State::ANNOTATION_END; } else { logger.error("Unexpected token \"" + token.content + "\"", token); } diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp index bb5db65..3827118 100644 --- a/src/formats/osml/OsmlStreamParser.hpp +++ b/src/formats/osml/OsmlStreamParser.hpp @@ -161,7 +161,13 @@ public: /** * Default constructor. */ - Command() : hasRange(false), inField(false), inRangeField(false), inDefaultField() {} + Command() + : hasRange(false), + inField(false), + inRangeField(false), + inDefaultField() + { + } /** * Constructor of the Command class. @@ -179,8 +185,8 @@ public: * @param inDefaultField is set to true if we currently are in a * specially marked default field. */ - Command(Variant name, Variant arguments, bool hasRange, bool inField, - bool inRangeField, bool inDefaultField) + Command(Variant name, Variant arguments, bool hasRange, + bool inField, bool inRangeField, bool inDefaultField) : name(std::move(name)), arguments(std::move(arguments)), hasRange(hasRange), @@ -266,9 +272,11 @@ private: * * @param start is the start byte offset of the command (including the * backslash) + * @param isAnnotation if true, the command is not returned as command, but + * as annotation start. 
* @return true if a command was actuall parsed, false otherwise. */ - State parseCommand(size_t start); + State parseCommand(size_t start, bool isAnnotation); /** * Function used internally to parse a block comment. diff --git a/test/formats/osml/OsmlStreamParserTest.cpp b/test/formats/osml/OsmlStreamParserTest.cpp index 5f23822..d52fa5b 100644 --- a/test/formats/osml/OsmlStreamParserTest.cpp +++ b/test/formats/osml/OsmlStreamParserTest.cpp @@ -98,6 +98,56 @@ static void assertFieldEnd(OsmlStreamParser &reader, } } +static void assertAnnotationStart(OsmlStreamParser &reader, + const std::string &name, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) +{ + ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_START, reader.parse()); + EXPECT_EQ(name, reader.getCommandName().asString()); + if (start != InvalidSourceOffset) { + EXPECT_EQ(start, reader.getCommandName().getLocation().getStart()); + EXPECT_EQ(start, reader.getLocation().getStart()); + } + if (end != InvalidSourceOffset) { + EXPECT_EQ(end, reader.getCommandName().getLocation().getEnd()); + EXPECT_EQ(end, reader.getLocation().getEnd()); + } +} + +static void assertAnnotationStart(OsmlStreamParser &reader, + const std::string &name, + const Variant::mapType &args, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) +{ + assertAnnotationStart(reader, name, start, end); + EXPECT_EQ(args, reader.getCommandArguments()); +} + +static void assertAnnotationEnd(OsmlStreamParser &reader, + const std::string &name, + const std::string &elementName, + SourceOffset start = InvalidSourceOffset, + SourceOffset end = InvalidSourceOffset) +{ + ASSERT_EQ(OsmlStreamParser::State::ANNOTATION_END, reader.parse()); + ASSERT_EQ(name, reader.getCommandName().asString()); + if (!elementName.empty()) { + ASSERT_EQ(1U, reader.getCommandArguments().asMap().size()); + ASSERT_EQ(1U, reader.getCommandArguments().asMap().count("name")); + + auto it = 
reader.getCommandArguments().asMap().find("name"); + ASSERT_EQ(elementName, it->second.asString()); + } + if (start != InvalidSourceOffset) { + EXPECT_EQ(start, reader.getLocation().getStart()); + } + if (end != InvalidSourceOffset) { + EXPECT_EQ(end, reader.getLocation().getEnd()); + } +} + static void assertEnd(OsmlStreamParser &reader, SourceOffset start = InvalidSourceOffset, SourceOffset end = InvalidSourceOffset) @@ -184,9 +234,6 @@ TEST(OsmlStreamParser, escapeSpecialCharacters) testEscapeSpecialCharacter("\\"); testEscapeSpecialCharacter("{"); testEscapeSpecialCharacter("}"); - testEscapeSpecialCharacter("<"); - testEscapeSpecialCharacter(">"); - testEscapeSpecialCharacter("|"); } TEST(OsmlStreamParser, simpleSingleLineComment) @@ -1035,5 +1082,180 @@ TEST(OsmlStreamParser, errorFieldAfterExplicitDefaultField) assertEnd(reader, 10, 10); } +TEST(OsmlStreamParser, annotationStart) +{ + const char *testString = "<\\a"; + // 0 12 + + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertAnnotationStart(reader, "a", Variant::mapType{}, 0, 3); + assertEnd(reader, 3, 3); +} + +TEST(OsmlStreamParser, annotationStartWithName) +{ + const char *testString = "<\\annotationWithName#aName"; + // 0 1234567890123456789012345 + // 0 1 2 + + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertAnnotationStart(reader, "annotationWithName", + Variant::mapType{{"name", "aName"}}, 0, 20); + assertEnd(reader, 26, 26); +} + +TEST(OsmlStreamParser, annotationStartWithArguments) +{ + const char *testString = "<\\annotationWithName#aName[a=1,b=2]"; + // 0 1234567890123456789012345678901234 + // 0 1 2 3 + + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertAnnotationStart( + reader, "annotationWithName", + Variant::mapType{{"name", "aName"}, {"a", 1}, {"b", 2}}, 0, 20); + assertEnd(reader, 35, 35); +} + +TEST(OsmlStreamParser, simpleAnnotationStartBeginEnd) +{ + 
const char *testString = "<\\begin{ab#name}[a=1,b=2] a \\end{ab}\\>"; + // 0 123456789012345678901234567 89012345 67 + // 0 1 2 3 + + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertAnnotationStart( + reader, "ab", Variant::mapType{{"name", "name"}, {"a", 1}, {"b", 2}}, 8, + 10); + assertFieldStart(reader, true, 26, 27); + assertData(reader, "a", 26, 27); + assertFieldEnd(reader, 33, 35); + assertAnnotationEnd(reader, "", "", 36, 38); + assertEnd(reader, 38, 38); +} + +TEST(OsmlStreamParser, annotationEnd) +{ + const char *testString = "\\a>"; + // 012 + + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertAnnotationEnd(reader, "a", "", 0, 2); + assertEnd(reader, 3, 3); +} + +TEST(OsmlStreamParser, annotationEndWithName) +{ + const char *testString = "\\a#name>"; + // 01234567 + + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertAnnotationEnd(reader, "a", "name", 0, 2); + assertEnd(reader, 8, 8); +} + +TEST(OsmlStreamParser, annotationEndWithNameAsArgs) +{ + const char *testString = "\\a[name=name]>"; + // 01234567890123 + + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertAnnotationEnd(reader, "a", "name", 0, 2); + assertEnd(reader, 14, 14); +} + +TEST(OsmlStreamParser, errorAnnotationEndWithArguments) +{ + const char *testString = "\\a[foo=bar]>"; + // 012345678901 + // 0 1 + + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + logger.reset(); + ASSERT_FALSE(logger.hasError()); + assertCommand(reader, "a", Variant::mapType{{"foo", "bar"}}, 0, 2); + ASSERT_TRUE(logger.hasError()); + assertData(reader, ">", 11, 12); + assertEnd(reader, 12, 12); +} + +TEST(OsmlStreamParser, closingAnnotation) +{ + const char *testString = "<\\a>"; + // 0 123 + + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + 
assertAnnotationStart(reader, "a", Variant::mapType{}, 0, 3); + assertData(reader, ">", 3, 4); + assertEnd(reader, 4, 4); +} + +TEST(OsmlStreamParser, annotationWithFields) +{ + const char *testString = "a <\\b{c}{d}{!e} f \\> g"; + // 012 345678901234567 8901 + // 0 1 2 + + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertData(reader, "a", 0, 1); + assertAnnotationStart(reader, "b", Variant::mapType{}, 2, 5); + assertFieldStart(reader, false, 5, 6); + assertData(reader, "c", 6, 7); + assertFieldEnd(reader, 7, 8); + assertFieldStart(reader, false, 8, 9); + assertData(reader, "d", 9, 10); + assertFieldEnd(reader, 10, 11); + assertFieldStart(reader, true, 11, 13); + assertData(reader, "e", 13, 14); + assertFieldEnd(reader, 14, 15); + assertData(reader, "f", 16, 17); + assertAnnotationEnd(reader, "", "", 18, 20); + assertData(reader, "g", 21, 22); + assertEnd(reader, 22, 22); +} + +TEST(OsmlStreamParser, annotationStartEscape) +{ + const char *testString = "<\\%test"; + // 0 123456 + // 0 + + CharReader charReader(testString); + + OsmlStreamParser reader(charReader, logger); + + assertData(reader, "<%test", 0, 7); + assertEnd(reader, 7, 7); +} } -- cgit v1.2.3 From ddbcefd960052f3d27fef5f57fc933d269b17857 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 00:12:13 +0100 Subject: Made flags only one bit wide --- src/formats/osml/OsmlStreamParser.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'src/formats') diff --git a/src/formats/osml/OsmlStreamParser.hpp b/src/formats/osml/OsmlStreamParser.hpp index 3827118..dc3034c 100644 --- a/src/formats/osml/OsmlStreamParser.hpp +++ b/src/formats/osml/OsmlStreamParser.hpp @@ -139,24 +139,24 @@ public: /** * Set to true if this is a command with clear begin and end. */ - bool hasRange; + bool hasRange : 1; /** * Set to true if we are currently inside a field of this command. 
*/ - bool inField; + bool inField : 1; /** * Set to true if we are currently in the range field of the command * (implies inField being set to true). */ - bool inRangeField; + bool inRangeField : 1; /** * Set to true if we are currently in a field that has been especially * marked as default field (using the "|") syntax. */ - bool inDefaultField; + bool inDefaultField : 1; /** * Default constructor. -- cgit v1.2.3 From 19d7c2e400850d06b21acc28733a7cd8ba343d1a Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 00:15:22 +0100 Subject: Renamed OsxmlParser --- src/formats/osxml/OsxmlParser.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'src/formats') diff --git a/src/formats/osxml/OsxmlParser.hpp b/src/formats/osxml/OsxmlParser.hpp index c8b6302..281a49c 100644 --- a/src/formats/osxml/OsxmlParser.hpp +++ b/src/formats/osxml/OsxmlParser.hpp @@ -25,18 +25,18 @@ * @author Andreas Stöckel (astoecke@techfak.uni-bielefeld.de) */ -#ifndef _OUSIA_XML_PARSER_HPP_ -#define _OUSIA_XML_PARSER_HPP_ +#ifndef _OUSIA_OSXML_PARSER_HPP_ +#define _OUSIA_OSXML_PARSER_HPP_ #include namespace ousia { /** - * The XmlParser class implements parsing the various types of Ousía XML - * documents using the expat stream XML parser. + * The OsxmlParser class implements parsing the various types of Ousía XML + * documents using the OsxmlEventParser and Stack classes. 
*/ -class XmlParser : public Parser { +class OsxmlParser : public Parser { protected: /** * Parses the given input stream as XML file and returns the parsed @@ -51,5 +51,5 @@ protected: } -#endif /* _OUSIA_XML_PARSER_HPP_ */ +#endif /* _OUSIA_OSXML_PARSER_HPP_ */ -- cgit v1.2.3 From b7ffeb3dca889aee1c878e2ef0f07644f910dba2 Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 20:58:05 +0100 Subject: Made OsxmlEvents interface consistent with Stack commands --- src/formats/osxml/OsxmlEventParser.cpp | 2 +- src/formats/osxml/OsxmlEventParser.hpp | 24 +++++++++-------- test/formats/osxml/OsxmlEventParserTest.cpp | 41 +++++++++++++---------------- 3 files changed, 32 insertions(+), 35 deletions(-) (limited to 'src/formats') diff --git a/src/formats/osxml/OsxmlEventParser.cpp b/src/formats/osxml/OsxmlEventParser.cpp index b4aff77..7404960 100644 --- a/src/formats/osxml/OsxmlEventParser.cpp +++ b/src/formats/osxml/OsxmlEventParser.cpp @@ -329,7 +329,7 @@ static void xmlStartElementHandler(void *ref, const XML_Char *name, // Just issue a "commandStart" event in any other case Variant nameVar = Variant::fromString(nameStr); nameVar.setLocation(nameLoc); - parser->getEvents().commandStart(nameVar, args); + parser->getEvents().command(nameVar, args); } } diff --git a/src/formats/osxml/OsxmlEventParser.hpp b/src/formats/osxml/OsxmlEventParser.hpp index aa20ea9..e39245f 100644 --- a/src/formats/osxml/OsxmlEventParser.hpp +++ b/src/formats/osxml/OsxmlEventParser.hpp @@ -58,34 +58,36 @@ public: * * @param name is a string variant containing name and location of the * command. - * @param args is a map variant containing the arguments that were given - * to the command. + * @param args is a map containing the arguments that were given to the + * command. */ - virtual void commandStart(Variant name, Variant args) = 0; + virtual void command(const Variant &name, const Variant::mapType &args) = 0; /** * Called whenever an annotation starts. 
Note that this implicitly always * starts the default field of the annotation. * - * @param name is a string variant containing the name of the annotation - * class and the location of the annotation definition. + * @param className is a string variant containing the name of the + * annotation class and the location of the annotation definition. * @param args is a map variant containing the arguments that were given * to the annotation definition. */ - virtual void annotationStart(Variant name, Variant args) = 0; + virtual void annotationStart(const Variant &className, + const Variant::mapType &args) = 0; /** * Called whenever the range of an annotation ends. The callee must * disambiguate the actual annotation that is finished here. * - * @param name is a string variant containing the name of the annotation - * class that should end here. May be empty (or nullptr), if no elementName - * has been specified at the end of the annotation. + * @param className is a string variant containing the name of the + * annotation class that should end here. May be empty (or nullptr), if no + * elementName has been specified at the end of the annotation. * @param elementName is the name of the annotation element that should be * ended here. May be empty (or nullptr), if no elementName has been * specified at the end of the annotation. */ - virtual void annotationEnd(Variant name, Variant elementName) = 0; + virtual void annotationEnd(const Variant &className, + const Variant &elementName) = 0; /** * Called whenever the default field which was implicitly started by @@ -105,7 +107,7 @@ public: * @param data is the already parsed data that should be passed to the * handler. 
*/ - virtual void data(Variant data) = 0; + virtual void data(const Variant &data) = 0; }; /** diff --git a/test/formats/osxml/OsxmlEventParserTest.cpp b/test/formats/osxml/OsxmlEventParserTest.cpp index 06c800f..3293370 100644 --- a/test/formats/osxml/OsxmlEventParserTest.cpp +++ b/test/formats/osxml/OsxmlEventParserTest.cpp @@ -31,7 +31,7 @@ static TerminalLogger logger(std::cerr, true); namespace { enum class OsxmlEvent { - COMMAND_START, + COMMAND, ANNOTATION_START, ANNOTATION_END, FIELD_END, @@ -42,22 +42,24 @@ class TestOsxmlEventListener : public OsxmlEvents { public: std::vector> events; - void commandStart(Variant name, Variant args) override + void command(const Variant &name, const Variant::mapType &args) override { - events.emplace_back(OsxmlEvent::COMMAND_START, + events.emplace_back(OsxmlEvent::COMMAND, Variant::arrayType{name, args}); } - void annotationStart(Variant name, Variant args) override + void annotationStart(const Variant &className, + const Variant::mapType &args) override { events.emplace_back(OsxmlEvent::ANNOTATION_START, - Variant::arrayType{name, args}); + Variant::arrayType{className, args}); } - void annotationEnd(Variant name, Variant elementName) override + void annotationEnd(const Variant &className, + const Variant &elementName) override { events.emplace_back(OsxmlEvent::ANNOTATION_END, - Variant::arrayType{name, elementName}); + Variant::arrayType{className, elementName}); } void fieldEnd() override @@ -65,7 +67,7 @@ public: events.emplace_back(OsxmlEvent::FIELD_END, Variant::arrayType{}); } - void data(Variant data) override + void data(const Variant &data) override { events.emplace_back(OsxmlEvent::DATA, Variant::arrayType{data}); } @@ -91,7 +93,7 @@ TEST(OsxmlEventParser, simpleCommandWithArgs) // 0 1 2 3 std::vector> expectedEvents{ - {OsxmlEvent::COMMAND_START, + {OsxmlEvent::COMMAND, Variant::arrayType{ "a", Variant::mapType{ {"name", "test"}, {"a", 1}, {"b", 2}, {"c", "blub"}}}}, @@ -131,11 +133,9 @@ 
TEST(OsxmlEventParser, magicTopLevelTag) const char *testString = ""; std::vector> expectedEvents{ - {OsxmlEvent::COMMAND_START, - Variant::arrayType{{"a", Variant::mapType{}}}}, + {OsxmlEvent::COMMAND, Variant::arrayType{{"a", Variant::mapType{}}}}, {OsxmlEvent::FIELD_END, Variant::arrayType{}}, - {OsxmlEvent::COMMAND_START, - Variant::arrayType{{"b", Variant::mapType{}}}}, + {OsxmlEvent::COMMAND, Variant::arrayType{{"b", Variant::mapType{}}}}, {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; auto events = parseXml(testString); @@ -147,9 +147,8 @@ TEST(OsxmlEventParser, magicTopLevelTagInside) const char *testString = ""; std::vector> expectedEvents{ - {OsxmlEvent::COMMAND_START, - Variant::arrayType{{"a", Variant::mapType{}}}}, - {OsxmlEvent::COMMAND_START, + {OsxmlEvent::COMMAND, Variant::arrayType{{"a", Variant::mapType{}}}}, + {OsxmlEvent::COMMAND, Variant::arrayType{{"ousia", Variant::mapType{}}}}, {OsxmlEvent::FIELD_END, Variant::arrayType{}}, {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; @@ -165,8 +164,7 @@ TEST(OsxmlEventParser, commandWithDataPreserveWhitespace) // 0 1 2 std::vector> expectedEvents{ - {OsxmlEvent::COMMAND_START, - Variant::arrayType{"a", Variant::mapType{}}}, + {OsxmlEvent::COMMAND, Variant::arrayType{"a", Variant::mapType{}}}, {OsxmlEvent::DATA, Variant::arrayType{" hello \n world "}}, {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; @@ -185,8 +183,7 @@ TEST(OsxmlEventParser, commandWithDataTrimWhitespace) // 0 1 2 std::vector> expectedEvents{ - {OsxmlEvent::COMMAND_START, - Variant::arrayType{"a", Variant::mapType{}}}, + {OsxmlEvent::COMMAND, Variant::arrayType{"a", Variant::mapType{}}}, {OsxmlEvent::DATA, Variant::arrayType{"hello \n world"}}, {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; @@ -205,8 +202,7 @@ TEST(OsxmlEventParser, commandWithDataCollapseWhitespace) // 0 1 2 std::vector> expectedEvents{ - {OsxmlEvent::COMMAND_START, - Variant::arrayType{"a", Variant::mapType{}}}, + {OsxmlEvent::COMMAND, Variant::arrayType{"a", 
Variant::mapType{}}}, {OsxmlEvent::DATA, Variant::arrayType{"hello world"}}, {OsxmlEvent::FIELD_END, Variant::arrayType{}}}; @@ -217,6 +213,5 @@ TEST(OsxmlEventParser, commandWithDataCollapseWhitespace) ASSERT_EQ(5U, events[1].second.asArray()[0].getLocation().getStart()); ASSERT_EQ(19U, events[1].second.asArray()[0].getLocation().getEnd()); } - } -- cgit v1.2.3 From c298f00ef1633a663775fe9a715a249b9f4d255d Mon Sep 17 00:00:00 2001 From: Andreas Stöckel Date: Sun, 15 Feb 2015 20:58:26 +0100 Subject: Implemented OsxmlParser --- CMakeLists.txt | 2 + src/formats/osxml/OsxmlParser.cpp | 288 +++++++++------------------------ src/formats/osxml/OsxmlParser.hpp | 2 +- test/formats/osxml/OsxmlParserTest.cpp | 28 ++-- 4 files changed, 91 insertions(+), 229 deletions(-) (limited to 'src/formats') diff --git a/CMakeLists.txt b/CMakeLists.txt index 2106cf0..ec1bb4d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -203,6 +203,7 @@ TARGET_LINK_LIBRARIES(ousia_osml ADD_LIBRARY(ousia_osxml src/formats/osxml/OsxmlAttributeLocator src/formats/osxml/OsxmlEventParser + src/formats/osxml/OsxmlParser ) TARGET_LINK_LIBRARIES(ousia_osxml @@ -351,6 +352,7 @@ IF(TEST) ADD_EXECUTABLE(ousia_test_osxml test/formats/osxml/OsxmlEventParserTest + test/formats/osxml/OsxmlParserTest ) TARGET_LINK_LIBRARIES(ousia_test_osxml diff --git a/src/formats/osxml/OsxmlParser.cpp b/src/formats/osxml/OsxmlParser.cpp index 869c76a..c216855 100644 --- a/src/formats/osxml/OsxmlParser.cpp +++ b/src/formats/osxml/OsxmlParser.cpp @@ -16,223 +16,83 @@ along with this program. If not, see . 
*/ -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "XmlParser.hpp" +#include +#include +#include + +#include "OsxmlEventParser.hpp" +#include "OsxmlParser.hpp" namespace ousia { -namespace ParserStates { -/* Document states */ -static const ParserState Document = - ParserStateBuilder() - .parent(&None) - .createdNodeType(&RttiTypes::Document) - .elementHandler(DocumentHandler::create) - .arguments({Argument::String("name", "")}); - -static const ParserState DocumentChild = - ParserStateBuilder() - .parents({&Document, &DocumentChild}) - .createdNodeTypes({&RttiTypes::StructureNode, - &RttiTypes::AnnotationEntity, - &RttiTypes::DocumentField}) - .elementHandler(DocumentChildHandler::create); - -/* Domain states */ -static const ParserState Domain = ParserStateBuilder() - .parents({&None, &Document}) - .createdNodeType(&RttiTypes::Domain) - .elementHandler(DomainHandler::create) - .arguments({Argument::String("name")}); - -static const ParserState DomainStruct = - ParserStateBuilder() - .parent(&Domain) - .createdNodeType(&RttiTypes::StructuredClass) - .elementHandler(DomainStructHandler::create) - .arguments({Argument::String("name"), - Argument::Cardinality("cardinality", Cardinality::any()), - Argument::Bool("isRoot", false), - Argument::Bool("transparent", false), - Argument::String("isa", "")}); - -static const ParserState DomainAnnotation = - ParserStateBuilder() - .parent(&Domain) - .createdNodeType(&RttiTypes::AnnotationClass) - .elementHandler(DomainAnnotationHandler::create) - .arguments({Argument::String("name")}); - -static const ParserState DomainAttributes = - ParserStateBuilder() - .parents({&DomainStruct, &DomainAnnotation}) - .createdNodeType(&RttiTypes::StructType) - .elementHandler(DomainAttributesHandler::create) - .arguments({}); - -static const ParserState DomainAttribute = - ParserStateBuilder() - 
.parent(&DomainAttributes) - .elementHandler(TypesystemStructFieldHandler::create) - .arguments({Argument::String("name"), Argument::String("type"), - Argument::Any("default", Variant::fromObject(nullptr))}); - -static const ParserState DomainField = - ParserStateBuilder() - .parents({&DomainStruct, &DomainAnnotation}) - .createdNodeType(&RttiTypes::FieldDescriptor) - .elementHandler(DomainFieldHandler::create) - .arguments({Argument::String("name", ""), - Argument::Bool("isSubtree", false), - Argument::Bool("optional", false)}); - -static const ParserState DomainFieldRef = - ParserStateBuilder() - .parents({&DomainStruct, &DomainAnnotation}) - .createdNodeType(&RttiTypes::FieldDescriptor) - .elementHandler(DomainFieldRefHandler::create) - .arguments({Argument::String("ref", DEFAULT_FIELD_NAME)}); - -static const ParserState DomainStructPrimitive = - ParserStateBuilder() - .parents({&DomainStruct, &DomainAnnotation}) - .createdNodeType(&RttiTypes::FieldDescriptor) - .elementHandler(DomainPrimitiveHandler::create) - .arguments( - {Argument::String("name", ""), Argument::Bool("isSubtree", false), - Argument::Bool("optional", false), Argument::String("type")}); - -static const ParserState DomainStructChild = - ParserStateBuilder() - .parent(&DomainField) - .elementHandler(DomainChildHandler::create) - .arguments({Argument::String("ref")}); - -static const ParserState DomainStructParent = - ParserStateBuilder() - .parent(&DomainStruct) - .createdNodeType(&RttiTypes::DomainParent) - .elementHandler(DomainParentHandler::create) - .arguments({Argument::String("ref")}); - -static const ParserState DomainStructParentField = - ParserStateBuilder() - .parent(&DomainStructParent) - .createdNodeType(&RttiTypes::FieldDescriptor) - .elementHandler(DomainParentFieldHandler::create) - .arguments({Argument::String("name", ""), - Argument::Bool("isSubtree", false), - Argument::Bool("optional", false)}); - -static const ParserState DomainStructParentFieldRef = - ParserStateBuilder() - 
.parent(&DomainStructParent) - .createdNodeType(&RttiTypes::FieldDescriptor) - .elementHandler(DomainParentFieldRefHandler::create) - .arguments({Argument::String("ref", DEFAULT_FIELD_NAME)}); - -/* Typesystem states */ -static const ParserState Typesystem = - ParserStateBuilder() - .parents({&None, &Domain}) - .createdNodeType(&RttiTypes::Typesystem) - .elementHandler(TypesystemHandler::create) - .arguments({Argument::String("name", "")}); - -static const ParserState TypesystemEnum = - ParserStateBuilder() - .parent(&Typesystem) - .createdNodeType(&RttiTypes::EnumType) - .elementHandler(TypesystemEnumHandler::create) - .arguments({Argument::String("name")}); - -static const ParserState TypesystemEnumEntry = - ParserStateBuilder() - .parent(&TypesystemEnum) - .elementHandler(TypesystemEnumEntryHandler::create) - .arguments({}); - -static const ParserState TypesystemStruct = - ParserStateBuilder() - .parent(&Typesystem) - .createdNodeType(&RttiTypes::StructType) - .elementHandler(TypesystemStructHandler::create) - .arguments({Argument::String("name"), Argument::String("parent", "")}); - -static const ParserState TypesystemStructField = - ParserStateBuilder() - .parent(&TypesystemStruct) - .elementHandler(TypesystemStructFieldHandler::create) - .arguments({Argument::String("name"), Argument::String("type"), - Argument::Any("default", Variant::fromObject(nullptr))}); - -static const ParserState TypesystemConstant = - ParserStateBuilder() - .parent(&Typesystem) - .createdNodeType(&RttiTypes::Constant) - .elementHandler(TypesystemConstantHandler::create) - .arguments({Argument::String("name"), Argument::String("type"), - Argument::Any("value")}); - -/* Special states for import and include */ -static const ParserState Import = - ParserStateBuilder() - .parents({&Document, &Typesystem, &Domain}) - .elementHandler(ImportHandler::create) - .arguments({Argument::String("rel", ""), Argument::String("type", ""), - Argument::String("src", "")}); - -static const ParserState 
Include = - ParserStateBuilder() - .parent(&All) - .elementHandler(IncludeHandler::create) - .arguments({Argument::String("rel", ""), Argument::String("type", ""), - Argument::String("src", "")}); - -static const std::multimap XmlStates{ - {"document", &Document}, - {"*", &DocumentChild}, - {"domain", &Domain}, - {"struct", &DomainStruct}, - {"annotation", &DomainAnnotation}, - {"attributes", &DomainAttributes}, - {"attribute", &DomainAttribute}, - {"field", &DomainField}, - {"fieldRef", &DomainFieldRef}, - {"primitive", &DomainStructPrimitive}, - {"childRef", &DomainStructChild}, - {"parentRef", &DomainStructParent}, - {"field", &DomainStructParentField}, - {"fieldRef", &DomainStructParentFieldRef}, - {"typesystem", &Typesystem}, - {"enum", &TypesystemEnum}, - {"entry", &TypesystemEnumEntry}, - {"struct", &TypesystemStruct}, - {"field", &TypesystemStructField}, - {"constant", &TypesystemConstant}, - {"import", &Import}, - {"include", &Include}}; +using namespace parser_stack; + +/** + * Class containing the actual OsxmlParser implementation. + */ +class OsxmlParserImplementation : public OsxmlEvents { +private: + /** + * Actual xml parser -- converts the xml stream into a set of events. + */ + OsxmlEventParser parser; + + /** + * Pushdown automaton responsible for converting the xml events into an + * actual Node tree. + */ + Stack stack; + +public: + /** + * Constructor of the OsxmlParserImplementation class. + * + * @param reader is a reference to the CharReader instance from which the + * XML should be read. + * @param ctx is a reference to the ParserContext instance that should be + * used. + */ + OsxmlParserImplementation(CharReader &reader, ParserContext &ctx) + : parser(reader, *this, ctx.getLogger()), + stack(ctx, GenericParserStates) + { + } + + /** + * Starts the actual parsing process. 
+ */ + void parse() { parser.parse(); } + + void command(const Variant &name, const Variant::mapType &args) override + { + stack.command(name, args); + stack.fieldStart(true); + } + + void annotationStart(const Variant &name, + const Variant::mapType &args) override + { + stack.annotationStart(name, args); + stack.fieldStart(true); + } + + void annotationEnd(const Variant &className, + const Variant &elementName) override + { + stack.annotationEnd(className, elementName); + } + + void fieldEnd() override { stack.fieldEnd(); } + + void data(const Variant &data) override { stack.data(data); } +}; + +/* Class OsxmlParser */ + +void OsxmlParser::doParse(CharReader &reader, ParserContext &ctx) +{ + OsxmlParserImplementation impl(reader, ctx); + impl.parse(); } - - } diff --git a/src/formats/osxml/OsxmlParser.hpp b/src/formats/osxml/OsxmlParser.hpp index 281a49c..0fbf83c 100644 --- a/src/formats/osxml/OsxmlParser.hpp +++ b/src/formats/osxml/OsxmlParser.hpp @@ -17,7 +17,7 @@ */ /** - * @file XmlParser.hpp + * @file OsxmlParser.hpp * * Contains the parser responsible for reading Ousía XML Documents (extension * oxd) and Ousía XML Modules (extension oxm). 
diff --git a/test/formats/osxml/OsxmlParserTest.cpp b/test/formats/osxml/OsxmlParserTest.cpp index 269a3f6..a2bd8b1 100644 --- a/test/formats/osxml/OsxmlParserTest.cpp +++ b/test/formats/osxml/OsxmlParserTest.cpp @@ -30,7 +30,7 @@ #include #include -#include +#include namespace ousia { @@ -41,7 +41,7 @@ extern const Rtti Typesystem; } struct XmlStandaloneEnvironment : public StandaloneEnvironment { - XmlParser xmlParser; + OsxmlParser parser; FileLocator fileLocator; XmlStandaloneEnvironment(ConcreteLogger &logger) @@ -52,21 +52,21 @@ struct XmlStandaloneEnvironment : public StandaloneEnvironment { registry.registerDefaultExtensions(); registry.registerParser({"text/vnd.ousia.oxm", "text/vnd.ousia.oxd"}, - {&RttiTypes::Node}, &xmlParser); + {&RttiTypes::Node}, &parser); registry.registerResourceLocator(&fileLocator); } }; static TerminalLogger logger(std::cerr, true); -TEST(XmlParser, mismatchedTag) +TEST(OsxmlParser, mismatchedTag) { XmlStandaloneEnvironment env(logger); env.parse("mismatchedTag.oxm", "", "", RttiSet{&RttiTypes::Document}); ASSERT_TRUE(logger.hasError()); } -TEST(XmlParser, generic) +TEST(OsxmlParser, generic) { XmlStandaloneEnvironment env(logger); env.parse("generic.oxm", "", "", RttiSet{&RttiTypes::Node}); @@ -186,7 +186,7 @@ static void checkFieldDescriptor( Handle primitiveType = nullptr, bool optional = false) { auto res = desc->resolve(&RttiTypes::FieldDescriptor, name); - ASSERT_EQ(1, res.size()); + ASSERT_EQ(1U, res.size()); checkFieldDescriptor(res[0].node, name, parent, children, type, primitiveType, optional); } @@ -201,7 +201,7 @@ static void checkFieldDescriptor( optional); } -TEST(XmlParser, domainParsing) +TEST(OsxmlParser, domainParsing) { XmlStandaloneEnvironment env(logger); Rooted book_domain_node = @@ -339,10 +339,10 @@ static void checkText(Handle p, Handle expectedParent, { checkStructuredEntity(p, expectedParent, doc, "paragraph"); Rooted par = p.cast(); - ASSERT_EQ(1, par->getField().size()); + ASSERT_EQ(1U, 
par->getField().size()); checkStructuredEntity(par->getField()[0], par, doc, "text"); Rooted text = par->getField()[0].cast(); - ASSERT_EQ(1, text->getField().size()); + ASSERT_EQ(1U, text->getField().size()); Handle d = text->getField()[0]; ASSERT_FALSE(d == nullptr); @@ -352,7 +352,7 @@ static void checkText(Handle p, Handle expectedParent, ASSERT_EQ(expected, prim->getContent()); } -TEST(XmlParser, documentParsing) +TEST(OsxmlParser, documentParsing) { XmlStandaloneEnvironment env(logger); Rooted book_document_node = @@ -364,7 +364,7 @@ TEST(XmlParser, documentParsing) checkStructuredEntity(doc->getRoot(), doc, doc, "book"); { Rooted book = doc->getRoot(); - ASSERT_EQ(2, book->getField().size()); + ASSERT_EQ(2U, book->getField().size()); checkText(book->getField()[0], book, doc, "This might be some introductory text or a dedication."); checkStructuredEntity(book->getField()[1], book, doc, "chapter", @@ -372,7 +372,7 @@ TEST(XmlParser, documentParsing) { Rooted chapter = book->getField()[1].cast(); - ASSERT_EQ(3, chapter->getField().size()); + ASSERT_EQ(3U, chapter->getField().size()); checkText(chapter->getField()[0], chapter, doc, "Here we might have an introduction to the chapter."); checkStructuredEntity(chapter->getField()[1], chapter, doc, @@ -381,7 +381,7 @@ TEST(XmlParser, documentParsing) { Rooted section = chapter->getField()[1].cast(); - ASSERT_EQ(1, section->getField().size()); + ASSERT_EQ(1U, section->getField().size()); checkText(section->getField()[0], section, doc, "Here we might find the actual section content."); } @@ -391,7 +391,7 @@ TEST(XmlParser, documentParsing) { Rooted section = chapter->getField()[2].cast(); - ASSERT_EQ(1, section->getField().size()); + ASSERT_EQ(1U, section->getField().size()); checkText(section->getField()[0], section, doc, "Here we might find the actual section content."); } -- cgit v1.2.3