/* Ousía Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ /** * @file Document.hpp * * This header contains the class hierarchy of actual document classes. A graph * of connected instances of these nodes is a "Document". How the different * DocumentEntity instances may be connected within the graph is subject to the * specification in the respective Domain(s) (see also the Domain.hpp). * * A Document, from top to bottom, consists of a "Document" instance, * which "owns" the structural root node of the in-document graph. This might * for example be a "book" node of the "book" domain. That root node in turn has * structure nodes as children, which in turn may have children. This * constitutes a Structure Tree. Additionally annotations may be attached to * Structure Nodes, effectively resulting in a Document Graph instead of a * Document Tree (other references may introduce cycles as well). * * Consider this XML representation of a document using the "book" domain: * * \code{.xml} * * * * * * * * This might be some introductory text or a dedication. Ideally, of * course, such elements would be semantically specified as such in * additional domains (or in this one). * * Here we might have an introduction to the chapter, including some * overview of the chapter's structure. *
* Here we might find the actual section content. *
*
* Here we might find the actual section content. * * * And there might even be another paragraph. *
*
*
*
* \endcode * * As can be seen the StructureEntities inherently follow a tree structure that * is restricted by the implicit context-free grammar of the "book" Domain * definition (e.g. it is not allowed to have a "book" node inside a "section"; * refer to the Domain.hpp for more information). * * Another interesting fact is the special place of AnnotationEntities: They are * defined by start and end Anchors in the text. Note that this allows for * overlapping annotations and provides a more intuitive (and semantically * sound) handling of such span-like concepts. So the * * \code{.xml} * content * \endcode * * is implicitly expanded to: * * \code{.xml} * content * * \endcode * * Note that the place of an AnnotationEntity within the XML above is not * strictly defined. It might as well be placed as a child of the "book" node. * In general it is recommended to use the lowest possible place in the * StructureTree to include the AnnotationEntity for better readability. * * Also note that text content like * * Here we might find the actual section content. * * is implicitly expanded using transparency to: * * \code{.xml} * * * Here we might find the actual section content. * * * \endcode * * @author Benjamin Paaßen (bpaassen@techfak.uni-bielefeld.de) */ #ifndef _OUSIA_MODEL_DOCUMENT_HPP_ #define _OUSIA_MODEL_DOCUMENT_HPP_ #include #include #include "Node.hpp" #include "Domain.hpp" #include "Typesystem.hpp" namespace ousia { // Forward declarations class Rtti; class Document; class StructureNode; class StructuredEntity; class DocumentPrimitive; class Anchor; /** * A DocumentEntity is the common superclass for StructuredEntities and * AnnotationEntities. Similarly to DescriptorEntity in the Domain.hpp it * defines that each node in the Document graph may have attributes (in form * of a struct Variant), and fields. * The fields here are a vector of vectors. The first vector implements all * fields while the inner vector contains all children in this field. 
* We provide, however, convenience functions for better access via the field * name. * */ class DocumentEntity { private: /* * This is a rather dirty approach that should not be copied elsewhere: * We store a handle to the Node instance that inherits from * DocumentEntity. This Handle is not registered and would lead to segfaults * if we could not guarantee that it lives exactly as long as this * DocumentEntity (the handle refers to the subclass instance itself). */ Handle subInst; /* The Descriptor of this entity, held as an Owned reference. */ Owned descriptor; /* The attribute values of this entity (a Map Variant, see getAttributes). */ Variant attributes; /* One entry per field; each entry holds the children stored in that field. * Indexed by the value returned from getFieldDescriptorIndex (see getField). */ std::vector> fields; /* Looks up the index of the field with the given name. hasField compares the * result against -1, so -1 signals "not found" when enforce is false; with * enforce set, an unknown name presumably raises the exception documented at * getField -- TODO confirm in the implementation. */ int getFieldDescriptorIndex(const std::string &fieldName, bool enforce) const; /* Same lookup as above, keyed by a FieldDescriptor handle instead of a name. */ int getFieldDescriptorIndex(Handle fieldDescriptor, bool enforce) const; /* Implemented out of line; presumably invalidates the Node referenced by * subInst -- TODO confirm. */ void invalidateSubInstance(); /* Index-based counterparts of the public addStructureNode and * removeStructureNodeFromField overloads; i is presumably a field index as * returned by getFieldDescriptorIndex -- TODO confirm. */ void addStructureNode(Handle s, const int &i); bool removeStructureNodeFromField(Handle s, const int &i); protected: /* Validation routine of the DocumentEntity part. It is not marked override * because DocumentEntity does not inherit from Node. */ bool doValidate(Logger &logger) const; public: /** * The constructor for a DocumentEntity. Note that DocumentEntity does not * inherit from Node. Therefore we need to have a handle to the subclass Node * instance to create NodeVectors and Owned references. * * @param subInst is a handle to the subclass instance * (e.g. StructuredEntity), such that the fields vectors * and the descriptor reference can be obtained. * @param descriptor is the Descriptor for this DocumentEntity, which will * be transformed to an Owned reference of the given owner. * @param attributes is a Map Variant adhering to the attribute StructType * in the given descriptor. */ DocumentEntity(Handle subInst, Handle descriptor, Variant attributes = {}); /** * Returns the Descriptor for this DocumentEntity. * * @return the Descriptor for this DocumentEntity. */ Rooted getDescriptor() const { return descriptor; } /** * Sets the Descriptor for this DocumentEntity. * * @param d is the new Descriptor for this DocumentEntity. */ void setDescriptor(Handle d); /** * Returns a Map Variant adhering to the attribute StructType in the given * descriptor. 
* The Variant is returned by value. * * @return a Map Variant adhering to the attribute StructType in the given * descriptor. */ Variant getAttributes() const { return attributes; } /** * Sets the attributes for this DocumentEntity. Attributes are set as a Map * variant. * * @param a is a Map variant containing the attributes for this * DocumentEntity. */ void setAttributes(const Variant &a); /** * This returns true if there is a FieldDescriptor in the Descriptor for * this DocumentEntity which has the given name. If an empty name is * given it is assumed that the 'default' FieldDescriptor is referenced, * where 'default' means either: * 1.) The only TREE typed FieldDescriptor (if present) or * 2.) the only FieldDescriptor (if only one is specified). * * The check is a non-enforcing index lookup whose result is compared * against -1. * * @param fieldName is the name of a field as specified in the * FieldDescriptor in the Domain description. * @return true if this FieldDescriptor exists. */ bool hasField(const std::string &fieldName = "") const { return getFieldDescriptorIndex(fieldName, false) != -1; } /** * This returns the vector of entities containing all members of the field * with the given name. If an empty name is given it is assumed that the * 'default' FieldDescriptor is referenced, where 'default' means either: * 1.) The only TREE typed FieldDescriptor (if present) or * 2.) the only FieldDescriptor (if only one is specified). * * If the name is unknown an exception is thrown. * * @param fieldName is the name of a field as specified in the * FieldDescriptor in the Domain description. * @return a const reference to the NodeVector of all StructureNodes in * that field. */ const NodeVector &getField( const std::string &fieldName = "") const { return fields[getFieldDescriptorIndex(fieldName, true)]; } /** * This returns the vector of entities containing all members of the field * with the given FieldDescriptor. * * If the FieldDescriptor does not belong to the Descriptor of this node * an exception is thrown. 
* * @param fieldDescriptor is a FieldDescriptor defined in the Descriptor for * this DocumentEntity. * @return a const reference to the NodeVector of all * StructureNodes in that field. */ const NodeVector &getField( Handle fieldDescriptor) const { return fields[getFieldDescriptorIndex(fieldDescriptor, true)]; } /** * This adds a StructureNode to the field with the given name. If an * empty name is given it is assumed that the 'default' FieldDescriptor is * referenced, where 'default' means either: * 1.) The only TREE typed FieldDescriptor (if present) or * 2.) the only FieldDescriptor (if only one is specified). * * If the name is unknown an exception is thrown. * * This method also changes the parent of the newly added StructureNode if * it is not set to this DocumentEntity already and removes it from the * old parent. * * @param s is the StructureNode that shall be added. * @param fieldName is the name of a field as specified in the * FieldDescriptor in the Domain description. */ void addStructureNode(Handle s, const std::string &fieldName = ""); /** * This adds multiple StructureNodes to the field with the given name. * If an empty name is given it is assumed that the 'default' * FieldDescriptor is referenced, where 'default' means either: * 1.) The only TREE typed FieldDescriptor (if present) or * 2.) the only FieldDescriptor (if only one is specified). * * If the name is unknown an exception is thrown. * * This method also changes the parent of each newly added StructureNode if * it is not set to this DocumentEntity already and removes it from the * old parent. * * @param ss are the StructureNodes that shall be added. * @param fieldName is the name of a field as specified in the * FieldDescriptor in the Domain description. */ void addStructureNodes(const std::vector> &ss, const std::string &fieldName = ""); /** * This removes a StructureNode from the field with the given name. 
If an * empty name is given it is assumed that the 'default' FieldDescriptor is * referenced, where 'default' means either: * 1.) The only TREE typed FieldDescriptor (if present) or * 2.) the only FieldDescriptor (if only one is specified). * * If the name is unknown an exception is thrown. * * This method also changes the parent of the removed StructureNode to null. * * @param s is the StructureNode that shall be removed. * @param fieldName is the name of a field as specified in the * FieldDescriptor in the Domain description. * @return true if this StructureNode was a child here and false if * it was not found. */ bool removeStructureNodeFromField(Handle s, const std::string &fieldName = ""); /** * This adds a StructureNode to the field with the given FieldDescriptor. * * If the FieldDescriptor does not belong to the Descriptor of this node * an exception is thrown. * * This method also changes the parent of the newly added StructureNode if * it is not set to this DocumentEntity already and removes it from the * old parent. * * @param s is the StructureNode that shall be added. * @param fieldDescriptor is a FieldDescriptor defined in the Descriptor for * this DocumentEntity. */ void addStructureNode(Handle s, Handle fieldDescriptor); /** * This adds multiple StructureNodes to the field with the given * FieldDescriptor. * * If the FieldDescriptor does not belong to the Descriptor of this node * an exception is thrown. * * This method also changes the parent of each newly added StructureNode if * it is not set to this DocumentEntity already and removes it from the * old parent. * * @param ss are the StructureNodes that shall be added. * @param fieldDescriptor is a FieldDescriptor defined in the Descriptor for * this DocumentEntity. */ void addStructureNodes(const std::vector> &ss, Handle fieldDescriptor); /** * This removes a StructureNode from the field with the given * FieldDescriptor. * * This method also changes the parent of the removed StructureNode to null. 
* * @param s is the StructureNode that shall be removed. * @param fieldDescriptor is a FieldDescriptor defined in the Descriptor for * this DocumentEntity. * @return true if this StructureNode was a child here and false if * it was not found. */ bool removeStructureNodeFromField(Handle s, Handle fieldDescriptor); /** * This removes a StructureNode from this DocumentEntity. It iterates * through all fields to find it. * * This method also changes the parent of the removed StructureNode to null. * * @param s is the StructureNode that shall be removed. * @return true if this StructureNode was a child here and false if it was * not found. */ bool removeStructureNode(Handle s); /** * This creates a new StructuredEntity as child of this DocumentEntity. * * @param descriptor is the StructuredClass of this StructuredEntity. * @param attributes is a Map Variant containing attribute fillings for this * StructuredEntity. It is empty per default. * @param fieldName is the name of the field, where the newly created * StructuredEntity shall be added to this DocumentEntity. * @param name is some name for this StructuredEntity that may be used * for later reference. It is empty per default. * * @return the newly created StructuredEntity. */ Rooted createChildStructuredEntity( Handle descriptor, Variant attributes = {}, const std::string &fieldName = "", std::string name = ""); /** * Creates a new DocumentPrimitive as child of this DocumentEntity. * * @param content is a Variant containing the content of this * DocumentPrimitive. The Type of this Variant is * specified at the parent's Descriptor for the given * fieldName. It is empty per default. * @param fieldName is the name of the field, where the newly created * DocumentPrimitive shall be added to this DocumentEntity. * * @return the newly created DocumentPrimitive. */ Rooted createChildDocumentPrimitive( Variant content = {}, const std::string &fieldName = ""); /** * Creates a new Anchor as child of this DocumentEntity. * * @param name is the Anchor id. 
* @param fieldName is the name of the field, where the newly created * Anchor shall be added to this DocumentEntity. * * @return the newly created Anchor. */ Rooted createChildAnchor(std::string name, const std::string &fieldName = ""); }; /** * A StructureNode is a Node of the StructureTree of the document. This is a * common superclass for StructuredEntity, Anchor and DocumentPrimitive. */ class StructureNode : public Node { friend DocumentEntity; protected: bool doValidate(Logger &logger) const override; public: /** * Constructor for a StructureNode in the StructureTree. Implemented out of * line; the subclasses in this header document that the new node registers * itself as child of the given parent in the given field. * * @param mgr is the Manager instance. * @param name is the name of this StructureNode. * @param parent is the parent of this StructureNode. * @param fieldName is the name of the field of the parent this node * belongs to. */ StructureNode(Manager &mgr, std::string name, Handle parent, const std::string &fieldName); /** * Constructor for an empty StructureNode. It takes no field name and only * forwards its arguments to the Node constructor. * * @param mgr is the Manager instance. * @param name is the name of this StructureNode. It is empty per default. * @param parent is handed to the Node constructor as parent. It is nullptr * per default. */ StructureNode(Manager &mgr, std::string name = "", Handle parent = nullptr) : Node(mgr, std::move(name), parent) { } }; /** * A StructuredEntity is an instance of a StructuredClass. For more * information please refer to the header documentation above. */ class StructuredEntity : public StructureNode, public DocumentEntity { friend Document; protected: bool doValidate(Logger &logger) const override; public: /** * Constructor for a StructuredEntity in the Structure Tree. * * @param mgr is the Manager instance. * @param parent is the parent DocumentEntity of this StructuredEntity * in the DocumentTree. Note that this StructuredEntity * will automatically register itself as child of this * parent. * @param descriptor is the StructuredClass of this StructuredEntity. * @param attributes is a Map Variant containing attribute fillings for this * StructuredEntity. It is empty per default. * @param fieldName is the name of the field in the parent DocumentEntity * where this StructuredEntity shall be added. It is empty * per default, referring to the default field. * @param name is some name for this StructuredEntity that may be used * for later reference. It is empty per default. 
*/ StructuredEntity(Manager &mgr, Handle parent, Handle descriptor, Variant attributes = {}, const std::string &fieldName = "", std::string name = "") : StructureNode(mgr, std::move(name), parent, fieldName), DocumentEntity(this, descriptor, std::move(attributes)) { } /** * Constructor for a StructuredEntity at the document root. * * @param mgr is the Manager instance. * @param doc is the parent Document of this StructuredEntity. Note * that this StructuredEntity will automatically register * itself as child of this Document. * @param descriptor is the StructuredClass of this StructuredEntity. * @param attributes is a Map Variant containing attribute fillings for this * StructuredEntity. It is empty per default. * @param name is some name for this StructuredEntity that may be used * for later reference. It is empty per default. */ StructuredEntity(Manager &mgr, Handle doc, Handle descriptor, Variant attributes = {}, std::string name = ""); /** * Constructor for an empty StructuredEntity that is not yet connected. * * @param mgr is the Manager instance. * @param parent is the parent of this StructuredEntity. It is nullptr * per default. NOTE(review): the previous text here was * copied from the document-root constructor; whether this * constructor registers the entity at a given parent is * not visible in this header -- confirm. * @param descriptor is the StructuredClass of this StructuredEntity. It is * nullptr per default. * @param attributes is a Map Variant containing attribute fillings for this * StructuredEntity. It is empty per default. * @param name is some name for this StructuredEntity that may be used * for later reference. It is empty per default. */ StructuredEntity(Manager &mgr, Handle parent = nullptr, Handle descriptor = nullptr, Variant attributes = {}, std::string name = ""); }; /** * This is a wrapper for primitive types (Variants) inside the document graph. * The most straightforward example for this is the actual document text, e.g. * inside a paragraph. In that case this would represent a mere string. 
*/ class DocumentPrimitive : public StructureNode { private: /* The wrapped primitive value. */ Variant content; public: /** * Constructor for a DocumentPrimitive. The node name handed to the * StructureNode constructor is always the empty string. * * @param mgr is the Manager instance. * @param parent is the parent DocumentEntity of this DocumentPrimitive * in the DocumentTree. Note that this DocumentPrimitive * will automatically register itself as child of this * parent. * @param content is a Variant containing the content of this * DocumentPrimitive. The Type of this Variant is * specified at the parent's Descriptor for the given * fieldName. It is empty per default. * @param fieldName is the name of the field in the parent DocumentEntity * where this DocumentPrimitive shall be added. It is empty * per default, referring to the default field. */ DocumentPrimitive(Manager &mgr, Handle parent, Variant content = {}, const std::string &fieldName = "") : StructureNode(mgr, "", parent, fieldName), content(content) { } /** * Returns the content of this DocumentPrimitive (by value). * * @return the content of this DocumentPrimitive. */ Variant getContent() const { return content; } /** * Sets the content of this DocumentPrimitive to the given Variant. * invalidate() is called before the new content is stored. * * @param c is the new content of this DocumentPrimitive. */ void setContent(const Variant &c) { invalidate(); content = c; } }; /** * An Anchor is an elementary StructureNode without any children that * marks a point in the text content of the document that can later be * referenced by an AnnotationEntity as its start or end point. * Please refer to the AnnotationEntity documentation for more information. */ class Anchor : public StructureNode { protected: bool doValidate(Logger &logger) const override; public: /** * Constructor for Anchor. * * @param mgr is the Manager instance. * @param parent is the parent of this Anchor in the Structure Tree (!), * not the AnnotationEntity that references this Anchor. * Note that this Anchor will automatically register itself * as child of the given parent. * @param name is the Anchor id. 
* @param fieldName is the name of the field in the parent DocumentEntity * where this Anchor shall be added. It is empty * per default, referring to the default field. */ Anchor(Manager &mgr, std::string name, Handle parent, const std::string &fieldName = "") : StructureNode(mgr, std::move(name), parent, fieldName) { } }; /** * An AnnotationEntity is a span-like instance that is not bound by the elements * of the Structure Tree. An annotation may very well overlap and cross the * limits of StructureEntities. A typical example for AnnotationEntities are * the markups "emphasized" and "strong". In HTML-like markup languages these * concepts are handled as structure elements, like this: * * \code{.xml} * emphasized and strong * \endcode * * which is neither intuitive nor semantically sound. Therefore we take the * approach of anchoring the Annotation entities in the text like this: * * \code{.xml} * emphasized and strong * * * \endcode * * Which signifies that indeed the text "emphasized and" is emphasized, not * the two text excerpts "emphasized" and "and" separately. * * NOTE(review): both code examples above contain no tags at all; the markup * appears to have been lost and should be restored. * */ class AnnotationEntity : public Node, public DocumentEntity { friend DocumentEntity; friend Document; private: /* The Anchors delimiting this annotation, held as Owned references. */ Owned start; Owned end; protected: bool doValidate(Logger &logger) const override; public: /** * The constructor for an AnnotationEntity. * * @param mgr is the Manager instance. * @param parent is the Document this AnnotationEntity is part of. The * constructor will automatically register this * AnnotationEntity at that document. It is nullptr per * default. * @param descriptor is the AnnotationClass of this AnnotationEntity. It is * nullptr per default. * @param start is the start Anchor of this AnnotationEntity. It has to * be part of the Document given as parent. It is nullptr * per default. * @param end is the end Anchor of this AnnotationEntity. It has to * be part of the Document given as parent. It is nullptr * per default. * @param attributes is a Map Variant containing attribute fillings for this * AnnotationEntity. It is empty per default. 
* @param name is some name for this AnnotationEntity that might be * used for references later on. It is empty per default. */ AnnotationEntity(Manager &mgr, Handle parent = nullptr, Handle descriptor = nullptr, Handle start = nullptr, Handle end = nullptr, Variant attributes = {}, std::string name = ""); /** * Returns the start Anchor of this AnnotationEntity. * * @return the start Anchor of this AnnotationEntity. */ Rooted getStart() const { return start; } /** * Returns the end Anchor of this AnnotationEntity. * * @return the end Anchor of this AnnotationEntity. */ Rooted getEnd() const { return end; } /** * Sets the start Anchor of this AnnotationEntity. invalidate() is called * before the new Anchor is acquired and stored. * * @param s is the new start Anchor for this AnnotationEntity. */ void setStart(Handle s) { invalidate(); start = acquire(s); } /** * Sets the end Anchor of this AnnotationEntity. invalidate() is called * before the new Anchor is acquired and stored. * * @param e is the new end Anchor for this AnnotationEntity. */ void setEnd(Handle e) { invalidate(); end = acquire(e); } }; /** * A Document is mainly a wrapper for the Root structure node of the Document * Graph. It also references the domains that have been used within this * document and the AnnotationEntities that span over Anchors in this Document. */ class Document : public Node { private: // TODO: Might there be several roots? E.g. metadata? Owned root; NodeVector annotations; NodeVector domains; void doResolve(ResolutionState &state) override; protected: bool doValidate(Logger &logger) const override; public: /** * This sets up an empty document without a parent node. * * NOTE(review): only the annotations member is constructed with this * Document as argument; the domains member is default-constructed. * Confirm that this difference is intended. * * @param mgr is the Manager instance. * @param name is a name for this Document. */ Document(Manager &mgr, std::string name) : Node(mgr, std::move(name), nullptr), annotations(this) { } /** * Creates a new, empty Document on the heap and returns it wrapped in a * Rooted handle. * * @param mgr is the Manager instance. * @param name is a name for this Document. * @return a Rooted handle to the newly created Document. 
*/ static Rooted createEmptyDocument(Manager &mgr, std::string name) { return Rooted{new Document(mgr, std::move(name))}; } /** * Sets the root StructuredEntity of this Document. This also sets the * parent of the given StructuredEntity if it is not set to this Document * already. * * NOTE(review): root is dereferenced unconditionally after it has been * acquired, so a null handle must not be passed here. * * @param root is the new root StructuredEntity of this Document. */ void setRoot(Handle root) { invalidate(); this->root = acquire(root); if (root->getParent() != this) { root->setParent(this); } }; /** * Returns the root StructuredEntity of this Document. * * @return the root StructuredEntity of this Document. */ Rooted getRoot() const { return root; } /** * This creates a new StructuredEntity and adds it as root to this Document. * * @param descriptor is the StructuredClass of this StructuredEntity. * @param attributes is a Map Variant containing attribute fillings for this * StructuredEntity. It is empty per default. * @param name is some name for this StructuredEntity that may be used * for later reference. It is empty per default. * * @return the newly constructed StructuredEntity. */ Rooted createRootStructuredEntity( Handle descriptor, Variant attributes = {}, std::string name = ""); /** * Returns a const reference to the NodeVector of AnnotationEntities that * span over Anchors in this Document's structure. * * @return a const reference to the NodeVector of AnnotationEntities that * span over Anchors in this Document's structure. */ const NodeVector &getAnnotations() const { return annotations; } /** * Adds an AnnotationEntity to this Document. This also sets the parent * of the given AnnotationEntity if it is not set to this Document already * and removes it from the old Document. * * @param a is some AnnotationEntity. */ void addAnnotation(Handle a); /** * Adds multiple AnnotationEntities to this Document. This also sets the * parent of each given AnnotationEntity if it is not set to this Document * already and removes it from the old Document. * * @param as is a vector of AnnotationEntities. 
*/ void addAnnotations(const std::vector> &as); /** * Removes an AnnotationEntity from this Document. This also sets the parent * of the given AnnotationEntity to null. * * @param a is some AnnotationEntity. * @return true if the given AnnotationEntity was removed and false if this * Document did not have the given AnnotationEntity as child. */ bool removeAnnotation(Handle a); /** * Creates a new AnnotationEntity as child of this Document. * * @param descriptor is the AnnotationClass of this AnnotationEntity. * @param start is the start Anchor of this AnnotationEntity. It has to * be part of this Document. * @param end is the end Anchor of this AnnotationEntity. It has to * be part of this Document. * @param attributes is a Map Variant containing attribute fillings for this * AnnotationEntity. It is empty per default. * @param name is some name for this AnnotationEntity that might be * used for references later on. It is empty per default. * * @return the newly constructed AnnotationEntity. */ Rooted createChildAnnotation( Handle descriptor, Handle start, Handle end, Variant attributes = {}, std::string name = ""); /** * Returns a const reference to the NodeVector of Domains that are used * within this Document. * * @return a const reference to the NodeVector of Domains that are used * within this Document. */ const NodeVector &getDomains() const { return domains; } /** * Adds a Domain reference to this Document. invalidate() is called before * the reference is appended. * * @param d is the Domain that shall be referenced by this Document. */ void addDomain(Handle d) { invalidate(); domains.push_back(d); } /** * Adds multiple Domain references to this Document. invalidate() is called * before the references are appended at the end of the domain list. * * @param d is a vector of Domains that shall be referenced by this * Document. */ void addDomains(const std::vector> &d) { invalidate(); domains.insert(domains.end(), d.begin(), d.end()); } /** * Returns true if and only if the given StructureNode is part of this * document, meaning that there is a path of parent references in the * Structure Tree leading from the given StructureNode to this Document. * * @param s is some StructureNode. 
* @return true if and only if the given StructureNode is part of this * document. */ bool hasChild(Handle s) const; }; /* Rtti descriptors for the classes declared in this header; they are only * declared here (extern) and defined elsewhere. */ namespace RttiTypes { extern const Rtti Document; extern const Rtti DocumentEntity; extern const Rtti AnnotationEntity; extern const Rtti StructureNode; extern const Rtti StructuredEntity; extern const Rtti DocumentPrimitive; extern const Rtti Anchor; } /* namespace RttiTypes */ } /* namespace ousia */ #endif /* _OUSIA_MODEL_DOCUMENT_HPP_ */