Source code for steamship.agents.schema.text_splitters

from abc import ABC, abstractmethod
from typing import List

from steamship import Block, Tag


class TextSplitter(ABC):
    """Abstract base for strategies that cut a string into smaller strings.

    Subclasses implement :meth:`split_text`; :meth:`chunk_text_to_tags` builds
    on it to produce one ``Tag`` per piece of a ``Block``'s text.
    """

    @abstractmethod
    def split_text(self, text: str) -> List[str]:
        """Split the incoming text into strings"""
        raise NotImplementedError()

    def chunk_text_to_tags(self, block: Block, kind: str, name: str = None) -> List[Tag]:
        """Split the block's text and wrap each resulting piece in a Tag.

        Args:
            block: The block whose ``text`` is split. Its ``client``,
                ``file_id`` and ``id`` are forwarded to ``Tag.create``.
            kind: Passed through as the ``kind`` of every created tag.
            name: Passed through as the ``name`` of every created tag.

        Returns:
            One tag per piece returned by :meth:`split_text`, in order, each
            with its ``text`` attribute set to that piece. An empty list is
            returned when the block is not text, its text is ``None``, or its
            text is blank after stripping whitespace.
        """
        has_content = block.is_text() and block.text is not None and block.text.strip() != ""
        if not has_content:
            return []

        pieces = self.split_text(block.text)
        tags = []
        # Offsets are a running sum of piece lengths, so they line up with
        # positions in block.text only when the pieces are contiguous and
        # non-overlapping.
        offset = 0
        for piece in pieces:
            tag = Tag.create(
                client=block.client,
                file_id=block.file_id,
                block_id=block.id,
                kind=kind,
                name=name,
                start_idx=offset,
                end_idx=offset + len(piece),
            )
            tag.text = piece
            tags.append(tag)
            offset += len(piece)
        return tags
class FixedSizeTextSplitter(TextSplitter):
    """Simplest possible chunking strategy; every n characters.

    The text is cut into consecutive pieces of ``chunk_size`` characters; the
    final piece may be shorter. No empty pieces are produced.
    """

    chunk_size: int

    def __init__(self, chunk_size):
        """Store the chunk length.

        Args:
            chunk_size: Number of characters per chunk; must be positive.

        Raises:
            ValueError: If ``chunk_size`` is zero or negative.
        """
        # Fail at construction rather than with a ZeroDivisionError (or
        # nonsensical slices) later, when split_text is first called.
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
        self.chunk_size = chunk_size

    def split_text(self, text: str) -> List[str]:
        """Split ``text`` into consecutive chunks of ``chunk_size`` characters.

        Args:
            text: The string to split.

        Returns:
            The chunks in order; concatenating them yields ``text``. Empty
            input yields an empty list, and text whose length is an exact
            multiple of ``chunk_size`` does not get a trailing empty chunk.
        """
        step = self.chunk_size
        # Stepping the start offset directly avoids float division and the
        # extra iteration that previously emitted a trailing "" chunk.
        # Slicing past the end of the string is clamped by Python.
        return [text[start : start + step] for start in range(0, len(text), step)]