Source code for steamship.utils.text_chunker
import logging
[docs]
def chunk_text(text: str, chunk_size: int = 200, chunk_overlap: int = 50):
"""Chunk text for embedding and insertion into an embedding index."""
if chunk_size < 1:
logging.warning(f"chunk_size was f{chunk_size}. Setting to 200")
chunk_size = 200
if chunk_overlap < 0:
logging.warning(f"chunk_overlap was f{chunk_overlap}. Setting to 0")
chunk_overlap = 0
if chunk_overlap > chunk_size:
logging.warning(f"chunk_size was f{chunk_size}. Setting to chunk_size - 1 of {chunk_size}")
chunk_overlap = chunk_size - 1 if chunk_size > 1 else 1
step_size = chunk_size - chunk_overlap
for i in range(0, len(text), step_size):
yield text[i : i + chunk_size]