/*
Ousía
Copyright (C) 2014, 2015 Benjamin Paaßen, Andreas Stöckel
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file Document.hpp
*
* This header contains the class hierarchy of actual document classes. A graph
* of connected instances of these nodes is a "Document". How the different
* DocumentEntity instances may be connected within the graph is subject to the
* specification in the respective Domain(s) (see also the Domain.hpp).
*
* A Document, from top to bottom, consists of a "Document" instance,
* which "owns" the structural root node of the in-document graph. This might
* for example be a "book" node of the "book" domain. That root node in turn has
* structure nodes as children, which in turn may have children. This
* constitutes a Structure Tree. Additionally annotations may be attached to
* Structure Nodes, effectively resulting in a Document Graph instead of a
* Document Tree (other references may introduce cycles as well).
*
* Consider this simplified XML representation of a document (TODO: Use
* non-simplified XML as soon as possible):
*
*
*
*
*
* This is some text with some emphasized and
* strong text.
*
*
*
*
*
*
*
* As can be seen the StructureEntities inherently follow a tree structure that
* is restricted by the implicit context free grammar of the "book" Domain
* definition (e.g. it is not allowed to have a "book" node inside a "section";
* refer to the Domain.hpp for more information).
*
* Another interesting fact is the special place of AnnotationEntities: They are
* Defined by start and end Anchors in the text. Note that this allows for
* overlapping annotations and provides a more intuitive (and semantically
* sound) handling of such span-like concepts.
* Note that the place of an AnnotationEntity within the XML above is not
* strictly defined. It might as well be placed as a child of the "book" node.
* In general it is recommended to use the lowest possible place in the
* StructureTree to include the AnnotationEntity for better readability.
*
* @author Benjamin Paaßen (bpaassen@techfak.uni-bielefeld.de)
*/
#ifndef _OUSIA_MODEL_DOCUMENT_HPP_
#define _OUSIA_MODEL_DOCUMENT_HPP_
#include
#include
#include "Node.hpp"
#include "Domain.hpp"
#include "Typesystem.hpp"
namespace ousia {
namespace model {
class StructuredEntity;
class AnnotationEntity;
class Document;
/**
* A DocumentEntity is the common superclass for StructuredEntities and
* AnnotationEntities. Similarly to DescriptorEntity in the Domain.hpp it
* defines that each node in the Document graph may have attributes (in form
* of a struct Variant), and fields.
* The fields here are a vector of vectors. The first vector implements all
* fields while the inner vector contains all children in this field.
* We provide, however, convenience functions for better access via the field
* name.
*
*/
class DocumentEntity : public Node {
private:
    // Descriptor handed to the constructor, stored via acquire().
    Owned descriptor;
    // Attribute data handed to the constructor; never modified afterwards.
    const Variant attributes;
    // Outer vector: one entry per field; inner vector: children of that field.
    std::vector> fields;

    /**
     * Resolves a field name to an index. Only declared here; hasField()
     * interprets a result of -1 as "no field with that name".
     * NOTE(review): presumably every other result indexes into "fields" --
     * confirm against the implementation.
     *
     * @param fieldName is the name of the field to resolve.
     * @return the index of the field or -1.
     */
    int getFieldDescriptorIndex(const std::string &fieldName);

public:
    /**
     * Constructs the entity and appends one empty child list to the field
     * storage for every FieldDescriptor of the given descriptor.
     *
     * @param mgr        is the Manager instance forwarded to the Node base.
     * @param parent     is the parent handle forwarded to the Node base.
     * @param descriptor is the descriptor of this entity. If this handle is
     *                   null, no child lists are created.
     * @param attributes are the attributes of this entity.
     * @param name       is the name forwarded to the Node base (empty per
     *                   default).
     */
    DocumentEntity(Manager &mgr, Handle parent,
                   Handle descriptor, Variant attributes,
                   std::string name = "")
        : Node(mgr, std::move(name), parent),
          descriptor(acquire(descriptor)),
          attributes(std::move(attributes))
    {
        // TODO: Validation at construction time?
        // Without a descriptor there are no fields to prepare.
        if (descriptor.isNull()) {
            return;
        }
        // Append an empty child list for each field of the descriptor.
        size_t fieldIdx = 0;
        while (fieldIdx < descriptor->getFieldDescriptors().size()) {
            fields.push_back(NodeVector(this));
            ++fieldIdx;
        }
    }

    /**
     * @return the descriptor this entity was constructed with.
     */
    Rooted getDescriptor() const { return descriptor; }

    /**
     * @return a copy of the attributes this entity was constructed with.
     */
    Variant getAttributes() const { return attributes; }

    /**
     * Grants direct, mutable access to the internal field storage of this
     * DocumentEntity. Using this is not recommended. TODO: Delete this?
     *
     * @return a reference to the internal vector of child lists.
     */
    std::vector> &getFields() { return fields; }

    /**
     * Checks whether the Descriptor of this DocumentEntity has a
     * FieldDescriptor with the given name. An empty name refers to the
     * 'default' FieldDescriptor, which is either:
     * 1.) The only TREE typed FieldDescriptor (if present) or
     * 2.) the only FieldDescriptor (if only one is specified).
     *
     * @param fieldName is the name of a field as specified in the
     *                  FieldDescriptor in the Domain description.
     * @return true if this FieldDescriptor exists.
     */
    bool hasField(const std::string &fieldName = "")
    {
        const int fieldIndex = getFieldDescriptorIndex(fieldName);
        return fieldIndex != -1;
    }

    /**
     * Retrieves all members of the field whose FieldDescriptor has the given
     * name. An empty name refers to the 'default' FieldDescriptor, which is
     * either:
     * 1.) The only TREE typed FieldDescriptor (if present) or
     * 2.) the only FieldDescriptor (if only one is specified).
     *
     * Note that the result of this method may be ambiguous: an empty
     * NodeVector is returned both if no FieldDescriptor matches the given
     * name and if the field exists but has no members. It is therefore
     * recommended to additionally check the output of "hasField" or to use
     * the overload of this method that takes a FieldDescriptor.
     *
     * @param res       is a NodeVector reference where the result will be
     *                  stored. After the call it refers to all
     *                  StructuredEntities in that field. If the field is
     *                  unknown or has no members yet, the NodeVector will be
     *                  empty.
     * @param fieldName is the name of the field as specified in the
     *                  FieldDescriptor in the Domain description.
     */
    void getField(NodeVector &res, const std::string &fieldName = "");

    /**
     * Retrieves all members of the field with the given FieldDescriptor.
     *
     * If the FieldDescriptor does not belong to the Descriptor of this node
     * an exception is thrown.
     *
     * @param fieldDescriptor is a FieldDescriptor defined in the Descriptor
     *                        for this DocumentEntity.
     * @return a NodeVector of all StructuredEntities in that field.
     */
    NodeVector &getField(Rooted fieldDescriptor);
};
/**
* A StructuredEntity is a node in the Structure Tree of a document. For more
* information please refer to the header documentation above.
*/
class StructuredEntity : public DocumentEntity {
private:
    // Annotations attached to this entity; constructed with "this".
    NodeVector annotations;

public:
    /**
     * Constructs a StructuredEntity. All arguments are forwarded to the
     * DocumentEntity constructor; the annotation list starts out empty.
     *
     * @param mgr        is the Manager instance.
     * @param parent     is the parent of this entity.
     * @param descriptor is the descriptor of this entity.
     * @param attributes are the attributes of this entity.
     * @param name       is the name of this entity (empty per default).
     */
    StructuredEntity(Manager &mgr, Handle parent,
                     Handle descriptor, Variant attributes,
                     std::string name = "")
        : DocumentEntity(mgr, parent, descriptor, std::move(attributes),
                         std::move(name)),
          annotations(this)
    {}

    /**
     * @return a mutable reference to the annotation list of this entity.
     */
    NodeVector &getAnnotations() { return this->annotations; }

    /**
     * Builds the root StructuredEntity for the given document and
     * automatically appends the newly built entity to that document.
     *
     * @param document   is the document this entity shall be built for. The
     *                   resulting entity will automatically be appended to
     *                   that document. Also the manager of that document will
     *                   be used to register the new node.
     * @param domains    are the domains that are used to find the
     *                   StructuredClass for the new node. The domains will be
     *                   searched in the given order.
     * @param className  is the name of the StructuredClass.
     * @param attributes are the attributes of the new node in terms of a
     *                   Struct variant (empty per default).
     * @param name       is the name of this StructuredEntity (empty per
     *                   default).
     * @return the newly created StructuredEntity or a nullptr if some input
     *         handle was empty or the given domains did not contain a
     *         StructuredClass with the given name.
     */
    static Rooted buildRootEntity(
        Handle document, std::vector> domains,
        const std::string &className, Variant attributes = Variant(),
        std::string name = "");

    /**
     * Builds a StructuredEntity as child of the given DocumentEntity and
     * automatically appends the newly built entity to its parent.
     *
     * @param parent     is the parent DocumentEntity. The newly constructed
     *                   StructuredEntity will automatically be appended to
     *                   it.
     * @param domains    are the domains that are used to find the
     *                   StructuredClass for the new node. The domains will be
     *                   searched in the given order.
     * @param className  is the name of the StructuredClass.
     * @param fieldName  is the name of the field where the newly constructed
     *                   StructuredEntity shall be appended.
     * @param attributes are the attributes of the new node in terms of a
     *                   Struct variant (empty per default).
     * @param name       is the name of this StructuredEntity (empty per
     *                   default).
     * @return the newly created StructuredEntity or a nullptr if some input
     *         handle was empty or the given domains did not contain a
     *         StructuredClass with the given name.
     */
    static Rooted buildEntity(
        Handle parent, std::vector> domains,
        const std::string &className, const std::string &fieldName = "",
        Variant attributes = Variant(), std::string name = "");
};
/**
* This is a wrapper for primitive types (Variants) inside the document graph.
* The most straightforward example for this is the actual document text, e.g.
* inside a paragraph. In that case this would represent a mere string.
*/
class DocumentPrimitive : public StructuredEntity {
public:
    /**
     * Constructs a primitive node. The StructuredEntity base is given
     * nullptr as descriptor and the content as its attributes.
     *
     * @param mgr     is the Manager instance.
     * @param parent  is the parent of this node.
     * @param content is the primitive value wrapped by this node.
     */
    DocumentPrimitive(Manager &mgr, Handle parent,
                      Variant content)
        : StructuredEntity(mgr, parent, nullptr, std::move(content))
    {}

    /**
     * @return a copy of the wrapped content, which is stored as the
     *         attributes of this entity.
     */
    Variant getContent() const { return this->getAttributes(); }

    // TODO: Override such methods like "getField" to disable them?

    /**
     * Builds a DocumentPrimitive as child of the given DocumentEntity and
     * automatically appends the newly built entity to its parent.
     *
     * @param parent    is the parent DocumentEntity. The newly constructed
     *                  DocumentPrimitive will automatically be appended to
     *                  it.
     * @param content   is the primitive content of the new node in terms of
     *                  a Struct variant.
     * @param fieldName is the name of the field where the newly constructed
     *                  DocumentPrimitive shall be appended.
     * @return the newly created entity or a nullptr if the input handle was
     *         empty. NOTE(review): the exact failure conditions are decided
     *         by the implementation, which is not part of this header --
     *         confirm there.
     */
    static Rooted buildEntity(
        Handle parent, Variant content,
        const std::string &fieldName = "");
};
/**
* An AnnotationEntity is a span-like instance that is not bound by the elements
* of the Structure Tree. An annotation may very well overlap and cross the
* limits of StructureEntities. A typical example for AnnotationEntities are
* the markups "emphasized" and "strong". In HTML like markup languages these
* concepts are handled as structure elements, like this:
*
* emphasized and strong
*
* which is neither intuitive nor semantically sound. Therefore we take the
* approach of anchoring the Annotation entities in the text like this:
*
* emphasized and strong
*
*
*
* Which signifies that indeed the text "emphasized and" is emphasized, not
* the two text excerpts "emphasized" and "and" separately.
*
*/
class AnnotationEntity : public DocumentEntity {
public:
    /**
     * An Anchor is an elementary StructuredEntity without any children that
     * marks a point in the text content of the document that can later be
     * referenced by an AnnotationEntity as its start and end point.
     * Please refer to the AnnotationEntity documentation for more information.
     */
    class Anchor : public StructuredEntity {
    public:
        /**
         * Constructs an Anchor. The StructuredEntity base is given nullptr
         * as descriptor and an empty Variant as attributes.
         *
         * @param mgr    is the Manager instance.
         * @param parent is the parent of this Anchor in the Structure Tree
         *               (!), not the AnnotationEntity that references this
         *               Anchor.
         * @param name   is the Anchor id.
         */
        Anchor(Manager &mgr, Handle parent,
               std::string name = "")
            : StructuredEntity(mgr, parent, nullptr, Variant(), std::move(name))
        {
        }
    };

private:
    // Anchor marking where the annotated span begins.
    Owned start;
    // Anchor marking where the annotated span ends.
    Owned end;

public:
    /**
     * Constructs an AnnotationEntity spanning from the start Anchor to the
     * end Anchor.
     *
     * @param mgr        is the Manager instance.
     * @param parent     is the parent of this entity.
     * @param descriptor is the descriptor of this entity.
     * @param attributes are the attributes of this entity.
     * @param start      is the Anchor at which the annotation starts.
     * @param end        is the Anchor at which the annotation ends.
     * @param name       is the name of this entity (empty per default).
     */
    AnnotationEntity(Manager &mgr, Handle parent,
                     Handle descriptor, Variant attributes,
                     Handle start, Handle end,
                     std::string name = "")
        // "attributes" is a by-value sink parameter: move it on to the base
        // class (as the StructuredEntity constructor does) instead of
        // copying it a second time.
        : DocumentEntity(mgr, parent, descriptor, std::move(attributes),
                         std::move(name)),
          start(acquire(start)),
          end(acquire(end))
    {
    }

    /**
     * @return the Anchor at which this annotation starts.
     */
    Rooted getStart() const { return start; }

    /**
     * @return the Anchor at which this annotation ends.
     */
    Rooted getEnd() const { return end; }
};
/**
* A Document is mainly a wrapper for the Root structure node of the Document
* Graph.
*/
class Document : public Node {
private:
//TODO: Might there be several roots? E.g. metadata?
Owned root;
public:
Document(Manager &mgr, std::string name)
// TODO: Can a document have a parent?
: Node(mgr, std::move(name), nullptr)
{
}
void setRoot(Handle root) { root = acquire(root); };
Rooted getRoot() const { return root; }
};
}
}
#endif /* _OUSIA_MODEL_DOCUMENT_HPP_ */