Chunking text

Character based chunking

  • Note: chunking by a fixed number of characters means that words can be split in the middle! Don’t use this.
Python
def chunk_string(s, chunk_length, overlap):
    """Split *s* into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_length - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    Whatever is left once a full window no longer fits is appended as a
    final, shorter chunk.

    Args:
        s: The text to split.
        chunk_length: Number of characters per full chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must be
            smaller than ``chunk_length``. A negative value leaves a gap
            between chunks instead of an overlap.

    Returns:
        A list of chunks in order of appearance (empty for an empty string).

    Raises:
        ValueError: If ``chunk_length`` is not positive, or if ``overlap`` is
            not smaller than ``chunk_length``.
    """
    if chunk_length <= 0:
        raise ValueError("chunk_length must be a positive integer")
    # A step of zero or less would never advance the window, so the loop
    # below would run forever.
    if overlap >= chunk_length:
        raise ValueError("overlap must be smaller than chunk_length")

    step = chunk_length - overlap
    chunks = []
    start = 0

    # Emit every full-size window.
    while start + chunk_length <= len(s):
        chunks.append(s[start:start + chunk_length])
        start += step

    # Emit the shorter remainder when the string is not perfectly divisible.
    if start < len(s):
        chunks.append(s[start:])

    return chunks

# Example usage
s = "Hello, this is an example string."
chunks = chunk_string(s, 10, 3)

print(len(chunks))
>> 5
print(chunks)
>> 'Hello, thi'
'this is an'
' an exampl'
'mple strin'
'ring.'

Word based chunking

  • Note: the split is simply by space
  • This means chunks can start or end in the middle of a sentence! Probably don’t use this.
Python
def chunk_string_by_words(s, words_per_chunk, word_overlap):
    """Split *s* into overlapping chunks of whitespace-separated words.

    The text is tokenised with ``str.split()``, so any run of whitespace
    separates words and the words of each chunk are re-joined with single
    spaces. Consecutive chunks start ``words_per_chunk - word_overlap`` words
    apart. Whatever is left once a full window no longer fits is appended as
    a final, shorter chunk.

    Args:
        s: The text to split.
        words_per_chunk: Number of words per full chunk; must be positive.
        word_overlap: Number of words shared by neighbouring chunks; must be
            smaller than ``words_per_chunk``. A negative value leaves a gap
            between chunks instead of an overlap.

    Returns:
        A list of chunks in order of appearance (empty when *s* contains no
        words).

    Raises:
        ValueError: If ``words_per_chunk`` is not positive, or if
            ``word_overlap`` is not smaller than ``words_per_chunk``.
    """
    if words_per_chunk <= 0:
        raise ValueError("words_per_chunk must be a positive integer")
    # A step of zero or less would never advance the window, so the loop
    # below would run forever.
    if word_overlap >= words_per_chunk:
        raise ValueError("word_overlap must be smaller than words_per_chunk")

    words = s.split()
    step = words_per_chunk - word_overlap
    chunks = []
    start = 0

    # Emit every full-size window.
    while start + words_per_chunk <= len(words):
        chunks.append(' '.join(words[start:start + words_per_chunk]))
        start += step

    # Emit the shorter remainder, if any words are left over.
    if start < len(words):
        chunks.append(' '.join(words[start:]))

    return chunks

# Example usage
s = "Hello, this is an example string for word based chunking."
chunks = chunk_string_by_words(s, 5, 2)

print(len(chunks))
>> 3
print(chunks)
>> 'Hello, this is an example'
'an example string for word'
'for word based chunking.'

Sentence based chunking

  • Note: splitting with re.split(r'(?<=[.!?]) +', s) does not cover every way a sentence can end.
  • Note: this method assumes that sentences are always followed by at least one space after the punctuation, which might not always be true, especially with different writing styles or formats.
  • For advanced sentence splitting see NLTK’s “sent_tokenize” function (nltk.org)
Python
import re

def chunk_string_by_sentences(s, sentences_per_chunk, sentence_overlap):
    """Split *s* into overlapping chunks of whole sentences.

    A sentence boundary is one or more spaces that directly follow ``.``,
    ``!`` or ``?``. The sentences of each chunk are re-joined with single
    spaces. Consecutive chunks start
    ``sentences_per_chunk - sentence_overlap`` sentences apart. Whatever is
    left once a full window no longer fits is appended as a final, shorter
    chunk.

    Args:
        s: The text to split.
        sentences_per_chunk: Number of sentences per full chunk; must be
            positive.
        sentence_overlap: Number of sentences shared by neighbouring chunks;
            must be smaller than ``sentences_per_chunk``. A negative value
            leaves a gap between chunks instead of an overlap.

    Returns:
        A list of chunks in order of appearance (empty for an empty string).

    Raises:
        ValueError: If ``sentences_per_chunk`` is not positive, or if
            ``sentence_overlap`` is not smaller than ``sentences_per_chunk``.
    """
    if sentences_per_chunk <= 0:
        raise ValueError("sentences_per_chunk must be a positive integer")
    # A step of zero or less would never advance the window, so the loop
    # below would run forever.
    if sentence_overlap >= sentences_per_chunk:
        raise ValueError(
            "sentence_overlap must be smaller than sentences_per_chunk"
        )

    # re.split yields '' for an empty input and after trailing spaces that
    # follow the final punctuation mark; drop those so they never become
    # empty "sentences" (and therefore empty or space-padded chunks).
    sentences = [part for part in re.split(r'(?<=[.!?]) +', s) if part]
    step = sentences_per_chunk - sentence_overlap
    chunks = []
    start = 0

    # Emit every full-size window.
    while start + sentences_per_chunk <= len(sentences):
        chunks.append(' '.join(sentences[start:start + sentences_per_chunk]))
        start += step

    # Emit the shorter remainder, if any sentences are left over.
    if start < len(sentences):
        chunks.append(' '.join(sentences[start:]))

    return chunks

# Example usage
s = "Hello. This is an example string. It is used for sentence based chunking. Have a great day!"
chunks = chunk_string_by_sentences(s, 2, 1)
print(chunks)
>> 'Hello. This is an example string.'
'This is an example string. It is used for sentence based chunking.'
'It is used for sentence based chunking. Have a great day!'
'Have a great day!'