Chunking text – Patrick

Character based chunking

Note: this means that words can be split in the middle! Don’t use this.

Python

def chunk_string(s: str, chunk_length: int, overlap: int) -> list:
    """Split ``s`` into fixed-size character chunks that overlap.

    Consecutive chunks start ``chunk_length - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    Words may be cut in the middle because only character offsets are used.

    Args:
        s: The text to split.
        chunk_length: Number of characters in every full chunk; must be > 0.
        overlap: Number of characters shared by neighbouring chunks; must be
            smaller than ``chunk_length``.

    Returns:
        The list of chunks in order. Whatever text remains from the next
        stride position onward is appended as a final, shorter chunk; when
        ``overlap`` > 0 that last chunk can consist solely of characters
        already present at the end of the previous chunk. An empty ``s``
        yields an empty list.

    Raises:
        ValueError: If ``chunk_length`` is not positive or if ``overlap`` is
            not smaller than ``chunk_length``.
    """
    if chunk_length <= 0:
        raise ValueError(f"chunk_length must be positive, got {chunk_length}")

    stride = chunk_length - overlap
    # A stride of zero (or less) would never advance through the string,
    # which previously made the loop spin forever.
    if stride <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than "
            f"chunk_length ({chunk_length})"
        )

    # Start offsets of every chunk that fits completely inside ``s``.
    full_starts = range(0, len(s) - chunk_length + 1, stride)
    chunks = [s[begin:begin + chunk_length] for begin in full_starts]

    # Position where the next chunk would start; anything left from there on
    # is shorter than ``chunk_length`` and becomes the trailing chunk.
    tail_start = len(full_starts) * stride
    if tail_start < len(s):
        chunks.append(s[tail_start:])

    return chunks

# Example usage
s = "Hello, this is an example string."
chunks = chunk_string(s, 10, 3)

print(len(chunks))
>> 5
print(chunks)
>> 'Hello, thi'
'this is an'
' an exampl'
'mple strin'
'ring.'

Word based chunking

Note: the text is simply split on whitespace;
this means sentences can be split in the middle! Probably don’t use this.

Python

def chunk_string_by_words(s: str, words_per_chunk: int, word_overlap: int) -> list:
    """Split ``s`` into overlapping chunks of whitespace-separated words.

    The text is tokenised with ``str.split()`` (any run of whitespace is a
    separator) and each chunk is rebuilt by joining its words with a single
    space, so the original spacing is not preserved. Consecutive chunks start
    ``words_per_chunk - word_overlap`` words apart.

    Args:
        s: The text to split.
        words_per_chunk: Number of words in every full chunk; must be > 0.
        word_overlap: Number of words shared by neighbouring chunks; must be
            smaller than ``words_per_chunk``.

    Returns:
        The list of chunks in order. Words remaining from the next stride
        position onward are appended as a final, shorter chunk; when
        ``word_overlap`` > 0 that last chunk can consist solely of words
        already present at the end of the previous chunk. Text without any
        words yields an empty list.

    Raises:
        ValueError: If ``words_per_chunk`` is not positive or if
            ``word_overlap`` is not smaller than ``words_per_chunk``.
    """
    if words_per_chunk <= 0:
        raise ValueError(
            f"words_per_chunk must be positive, got {words_per_chunk}"
        )

    stride = words_per_chunk - word_overlap
    # A stride of zero (or less) would never advance through the words,
    # which previously made the loop spin forever.
    if stride <= 0:
        raise ValueError(
            f"word_overlap ({word_overlap}) must be smaller than "
            f"words_per_chunk ({words_per_chunk})"
        )

    words = s.split()

    # Start indices of every chunk that fits completely inside ``words``.
    full_starts = range(0, len(words) - words_per_chunk + 1, stride)
    chunks = [
        ' '.join(words[begin:begin + words_per_chunk])
        for begin in full_starts
    ]

    # Index where the next chunk would start; the leftover words (fewer than
    # ``words_per_chunk``) form the trailing chunk.
    tail_start = len(full_starts) * stride
    if tail_start < len(words):
        chunks.append(' '.join(words[tail_start:]))

    return chunks

# Example usage
s = "Hello, this is an example string for word based chunking."
chunks = chunk_string_by_words(s, 5, 2)

print(len(chunks))
>> 3
print(chunks)
>> 'Hello, this is an example'
'an example string for word'
'for word based chunking.'

Sentence based chunking

Note: the split with re.split(r'(?<=[.!?]) +', s) does not cover every way a sentence can end.
Note: this method assumes that sentence-ending punctuation is always followed by at least one space, which might not always be true, especially with different writing styles or formats.
For advanced sentence splitting, see NLTK’s “sent_tokenize” function (nltk.org).

Python

import re

def chunk_string_by_sentences(s: str, sentences_per_chunk: int, sentence_overlap: int) -> list:
    """Split ``s`` into overlapping chunks made of whole sentences.

    Sentences are found by splitting on one or more spaces that directly
    follow ``.``, ``!`` or ``?``; punctuation not followed by a space does
    not end a sentence. Each chunk is rebuilt by joining its sentences with a
    single space. Consecutive chunks start
    ``sentences_per_chunk - sentence_overlap`` sentences apart.

    Args:
        s: The text to split.
        sentences_per_chunk: Number of sentences in every full chunk; must
            be > 0.
        sentence_overlap: Number of sentences shared by neighbouring chunks;
            must be smaller than ``sentences_per_chunk``.

    Returns:
        The list of chunks in order. Sentences remaining from the next stride
        position onward are appended as a final, shorter chunk; when
        ``sentence_overlap`` > 0 that last chunk can consist solely of
        sentences already present at the end of the previous chunk. An empty
        ``s`` yields an empty list.

    Raises:
        ValueError: If ``sentences_per_chunk`` is not positive or if
            ``sentence_overlap`` is not smaller than ``sentences_per_chunk``.
    """
    if sentences_per_chunk <= 0:
        raise ValueError(
            f"sentences_per_chunk must be positive, got {sentences_per_chunk}"
        )

    stride = sentences_per_chunk - sentence_overlap
    # A stride of zero (or less) would never advance through the sentences,
    # which previously made the loop spin forever.
    if stride <= 0:
        raise ValueError(
            f"sentence_overlap ({sentence_overlap}) must be smaller than "
            f"sentences_per_chunk ({sentences_per_chunk})"
        )

    # re.split returns an empty string for empty input and after trailing
    # spaces that follow the final punctuation mark; those are not sentences
    # and must not count toward a chunk.
    sentences = [
        sentence
        for sentence in re.split(r'(?<=[.!?]) +', s)
        if sentence
    ]

    # Start indices of every chunk that fits completely inside ``sentences``.
    full_starts = range(0, len(sentences) - sentences_per_chunk + 1, stride)
    chunks = [
        ' '.join(sentences[begin:begin + sentences_per_chunk])
        for begin in full_starts
    ]

    # Index where the next chunk would start; the leftover sentences (fewer
    # than ``sentences_per_chunk``) form the trailing chunk.
    tail_start = len(full_starts) * stride
    if tail_start < len(sentences):
        chunks.append(' '.join(sentences[tail_start:]))

    return chunks

# Example usage
s = "Hello. This is an example string. It is used for sentence based chunking. Have a great day!"
chunks = chunk_string_by_sentences(s, 2, 1)
print(chunks)
>> 'Hello. This is an example string.'
'This is an example string. It is used for sentence based chunking.'
'It is used for sentence based chunking. Have a great day!'
'Have a great day!'