from enum import Enum
from typing import Optional
[docs]
class TagKind(str, Enum):
    """A set of `kind` constants for Tags.

    These define broad categories of tags. Suggested `name` values for each category are found in
    separate enums. For example: kind=TagKind.DOCUMENT, name=DocTag.H1

    Members subclass `str`, so each member compares equal to its string value.
    """

    PART_OF_SPEECH = "part-of-speech"
    DEPENDENCY = "dependency"
    SENTIMENT = "sentiment"
    EMOTION = "emotion"
    ENTITY = "entity"
    DOCUMENT = "document"
    TOKEN = "token"  # noqa: S105
    INTENT = "intent"
    EMBEDDING = "embedding"
    GENERATION = "generation"
    PROVENANCE = "provenance"
    TOPIC = "topic"
    TOKENIZATION = "tokenization"
    # SUMMARY and KIND share the value "summary". Enum makes the first declaration of a value the
    # canonical member and every later one an alias, so SUMMARY is declared first: lookups by value
    # (TagKind("summary")) and reprs then report SUMMARY rather than KIND.
    SUMMARY = "summary"
    # Alias of SUMMARY, kept so existing references to TagKind.KIND keep resolving to "summary".
    KIND = "summary"
    TIMESTAMP = "timestamp"
    SEARCH_RESULT = "search-result"
    ROLE = "role"
    CHAT = "chat"
    CHAT_HISTORY_CONTEXT = "chat-history-context"
    MESSAGE_ID = "message-id"
    STATUS_MESSAGE = "status-message"
    AGENT_STATUS_MESSAGE = "agent-status-message"
    TOOL_STATUS_MESSAGE = "tool-status-message"
    LLM_STATUS_MESSAGE = "llm-status-message"
    FUNCTION_ARG = "function-arg"
    FUNCTION_SELECTION = "function-selection"
[docs]
class DocTag(str, Enum):
    """A set of `name` constants for Tags with a `kind` of `TagKind.DOCUMENT`.

    The names are appropriate for HTML and Markdown ideas. Members subclass `str`, so each
    member compares equal to its string value.
    """

    DOCUMENT = "document"
    PAGE = "page"  # E.g. in a PDF
    REGION = "region"  # E.g., abstract catchall region in a document
    HEADER = "header"
    H1 = "h1"
    H2 = "h2"
    H3 = "h3"
    H4 = "h4"
    H5 = "h5"
    LINE = "line"
    TITLE = "title"
    SOURCE = "source"
    SUBTITLE = "subtitle"
    FOOTER = "footer"
    PARAGRAPH = "paragraph"
    ORDERED_LIST = "ordered-list"
    UNORDERED_LIST = "unordered-list"
    LIST_ITEM = "list-item"
    LINK = "link"
    CAPTION = "caption"
    IMAGE = "image"
    BLOCK_QUOTE = "block-quote"
    BLOCK_CODE = "block-code"
    UNKNOWN = "unknown"
    SENTENCE = "sentence"
    TOKEN = "token"  # noqa: S105
    SPAN = "span"
    DIV = "div"
    PRE = "pre"
    STRONG = "strong"
    EMPHASIZED = "emphasized"
    UNDERLINED = "underlined"
    TELETYPE = "teletype"
    ARTICLE = "article"
    MAIN = "main"
    CHAPTER = "chapter"
    TEXT = "text"
    CHAT = "chat"
    METADATA = "metadata"

    @staticmethod
    def from_html_tag(tagname: Optional[str]) -> Optional["DocTag"]:
        """Return the DocTag that corresponds to an HTML tag name.

        Args:
            tagname: An HTML element name such as ``"p"`` or ``"H1"``. Matching ignores case and
                surrounding whitespace.

        Returns:
            The matching DocTag, or ``None`` when `tagname` is ``None`` or has no mapping.
        """
        if tagname is None:
            return None
        return _HTML_TAG_TO_DOC_TAG.get(tagname.lower().strip())


# Lookup table for DocTag.from_html_tag. It is defined at module level, after the class, because
# a dict assigned inside an Enum class body would itself be turned into an enum member.
_HTML_TAG_TO_DOC_TAG = {
    "p": DocTag.PARAGRAPH,
    "h1": DocTag.H1,
    "h2": DocTag.H2,
    "h3": DocTag.H3,
    "h4": DocTag.H4,
    "h5": DocTag.H5,
    "ul": DocTag.UNORDERED_LIST,
    "ol": DocTag.ORDERED_LIST,
    "li": DocTag.LIST_ITEM,
    "a": DocTag.LINK,
    "div": DocTag.DIV,
    "img": DocTag.IMAGE,
    "span": DocTag.SPAN,
    "pre": DocTag.PRE,
    "code": DocTag.BLOCK_CODE,
    "blockquote": DocTag.BLOCK_QUOTE,
    "strong": DocTag.STRONG,
    "b": DocTag.STRONG,
    # "em" is the actual HTML emphasis element; "emph" is kept because it was accepted before.
    "em": DocTag.EMPHASIZED,
    "emph": DocTag.EMPHASIZED,
    "i": DocTag.EMPHASIZED,
    "u": DocTag.UNDERLINED,
    "tt": DocTag.TELETYPE,
    "article": DocTag.ARTICLE,
    "header": DocTag.HEADER,
    "footer": DocTag.FOOTER,
    "main": DocTag.MAIN,
}
[docs]
class TokenTag(str, Enum):
    """Suggested `name` constants for Tags whose `kind` is `TagKind.TOKEN`.

    These names are appropriate for parsing-level ideas. Members subclass `str`, so each member
    compares equal to its string value.
    """

    # Text forms
    TEXT_WITH_WHITESPACE = "text-with-whitespace"
    TEXT = "text"
    WHITESPACE = "whitespace"

    # Position and entity annotations
    HEAD = "head"
    LEFT_EDGE = "left-edge"
    RIGHT_EDGE = "right-edge"
    ENTITY_TYPE = "entity-type"
    ENTITY_IOB = "entity-iob"

    # Derived forms
    LEMMA = "lemma"
    NORMALIZED = "normalized"
    SHAPE = "shape"
    PREFIX = "prefix"
    SUFFIX = "suffix"

    # `is-*` names
    IS_ALPHA = "is-alpha"
    IS_ASCII = "is-ascii"
    IS_DIGIT = "is-digit"
    IS_TITLE = "is-title"
    IS_PUNCT = "is-punct"
    IS_LEFT_PUNCT = "is-left-punct"
    IS_RIGHT_PUNCT = "is-right-punct"
    IS_SPACE = "is-space"
    IS_BRACKET = "is-bracket"
    IS_QUOTE = "is-quote"
    IS_CURRENCY = "is-currency"

    # `like-*` names
    LIKE_URL = "like-url"
    LIKE_NUM = "like-num"
    LIKE_EMAIL = "like-email"

    # Remaining names
    IS_OUT_OF_VOCABULARY = "is-out-of-vocabulary"
    IS_STOPWORD = "is-stopword"
    LANGUAGE = "language"
[docs]
class TagValueKey(str, Enum):
    """Key constants for use inside the `value` object of a tag.

    Members subclass `str`, so each member compares equal to its string value.
    """

    SCORE = "score"  # Catch-all for confidence, score, ranking
    VALUE = "value"  # Catch-all for values of different types: integers, floats, booleans, strings
    VECTOR_VALUE = "vector-value"  # An array of floats or integers
    NUMBER_VALUE = "number-value"  # A float or integer
    BOOL_VALUE = "bool-value"  # A bool
    STRING_VALUE = "string-value"  # A string
    DIRECT = "direct"  # Whether some annotation is direct ("Susan said 'Hi'")

    # Start / end time of a region of a document, in some other medium (seconds)
    START_TIME_S = "start-time-s"
    END_TIME_S = "end-time-s"

    # The normalized name of an entity.
    # NOTE(review): this value uses an underscore while every other key here is hyphenated; it is
    # left unchanged because it is a runtime string.
    ENTITY_NAME = "entity_name"

    # Timestamp. Can be used to provide a time-based sort-ordering for tags.
    TIMESTAMP_VALUE = "timestamp-value"
[docs]
class GenerationTag(str, Enum):
    """Suggested `name` constants for Tags whose `kind` is `TagKind.GENERATION`.

    Members subclass `str`, so each member compares equal to its string value.
    """

    SUMMARY = "summary"  # A generated summary of some region of a document
    HEADLINE = "headline"  # A generated headline for some region of a document
    GIST = "gist"  # A generated "micro summary" of some region of a document
    # A generated completion using some region of the document as input
    PROMPT_COMPLETION = "prompt-completion"
[docs]
class ProvenanceTag(str, Enum):
    """Suggested `name` constants for Tags whose `kind` is `TagKind.PROVENANCE`.

    Members subclass `str`, so each member compares equal to its string value.
    """

    SPEAKER = "speaker"  # The speaker of a section of a document
    URL = "url"  # The URL from which some section of a document was sourced
    FILE = "file"  # The File from which some section of a document was sourced
[docs]
class RoleTag(str, Enum):
    """Suggested `name` constants for Tags whose `kind` is `TagKind.ROLE`.

    Each name records who created a block's content. Members subclass `str`, so each member
    compares equal to its string value.
    """

    # Content created by the System; likely instructional text on how to respond
    SYSTEM = "system"

    # Content created by an end user
    USER = "user"

    # Content created by the generative AI assistant
    ASSISTANT = "assistant"

    # Content created by a non-human agent. Omitted from prompt history, etc.;
    # meant to carry internal status messages.
    AGENT = "agent"

    # Block created by a Tool, as selected by an OpenAI Function call
    FUNCTION = "function"

    # Content created by a tool. Omitted from prompt history, etc.;
    # meant to carry internal status messages.
    TOOL = "tool"

    # Content created by an LLM. Omitted from prompt history, etc.;
    # meant to carry internal status messages.
    LLM = "llm"
[docs]
class ChatTag(str, Enum):
    """Suggested `name` constants for Tags whose `kind` is `TagKind.CHAT`.

    Members subclass `str`, so each member compares equal to its string value.
    """

    CHAT_ID = "chat-id"  # The chat id in which a message happened
    MESSAGE_ID = "message-id"  # The message id of a message
    # In environments which support threading, the thread id where the message occurred
    THREAD_ID = "thread-id"
    # In multiuser environments, the ID of the user who created the message
    USER_ID = "user-id"
    ROLE = "role"  # The role of a message
    CONTEXT_KEYS = "context-keys"  # The keys to look up a context
    INDEX_HANDLE = "index-handle"  # The handle of an embedding index
    CHUNK = "chunk"  # A chunk of text for indexing
    HISTORY = "history"  # A chat history should be marked as kind=CHAT/name=HISTORY
    MESSAGE = "message"  # A message should be marked as kind=CHAT/name=MESSAGE
    # Used to signal that an individual request was considered complete
    REQUEST_COMPLETE = "request-complete"