How to use the Python rank-bm25 library

Note: this library is called rank-bm25 on PyPI (and imported as rank_bm25), NOT bm25.

Shell
pip install rank-bm25

The official docs (on GitHub) say:

Python
from rank_bm25 import BM25Okapi

# Three short documents to index.
corpus = [
    "Hello there good man!",
    "It is quite windy in London",
    "How is the weather today?"
]

# BM25Okapi is given pre-tokenized documents (lists of tokens), not raw strings.
# Splitting on a single space keeps case and punctuation, e.g. "man!" and "today?".
tokenized_corpus = [doc.split(" ") for doc in corpus]

bm25 = BM25Okapi(tokenized_corpus)
# <rank_bm25.BM25Okapi at 0x1047881d0>


# Ranking Documents
# The query is tokenized the same way as the corpus.
query = "windy London"
tokenized_query = query.split(" ")

# One score per document, in corpus order; here only the second document matches.
doc_scores = bm25.get_scores(tokenized_query)
# array([0.        , 0.93729472, 0.        ])


# Get top N
# The original corpus is passed in so the best matches come back as strings.
bm25.get_top_n(tokenized_query, corpus, n=1)
# ['It is quite windy in London']

However, you often want to clean the corpus (lowercase it, remove punctuation) before indexing it. Once you do that, you must keep the original corpus around so that search results can be mapped back to the original, unedited strings.

Python
import string

from rank_bm25 import BM25Okapi


def create_clean_bm25(corpus):
    """Build a BM25Okapi index over a normalized copy of ``corpus``.

    Each document has ASCII punctuation removed and is lowercased before it
    is split into tokens. ``corpus`` itself is not modified, so callers can
    still pass it to ``get_top_n`` to get the original strings back.

    Args:
        corpus: Iterable of document strings.

    Returns:
        A ``BM25Okapi`` instance indexed over the cleaned, tokenized
        documents, in the same order as ``corpus``.
    """
    # Build the translation table once rather than once per document.
    punctuation_table = str.maketrans("", "", string.punctuation)

    def clean_text(text):
        # remove punctuation, then lowercase
        return text.translate(punctuation_table).lower()

    # split() with no argument splits on any run of whitespace, so the double
    # spaces left behind by removed punctuation (e.g. "a - b" -> "a  b") do
    # not turn into empty-string tokens in the index. It also makes an
    # explicit strip() unnecessary.
    tokenized_corpus = [clean_text(text).split() for text in corpus]
    return BM25Okapi(tokenized_corpus)
    

def search_with_bm25(corpus, query, n=2):
    """Return the ``n`` documents from ``corpus`` that best match ``query``.

    A new index is built from ``corpus`` on every call, so this is meant for
    one-off searches.

    Args:
        corpus: List of document strings; results are taken from this list.
        query: Free-text search string.
        n: Number of top-ranked documents to return.

    Returns:
        A list of up to ``n`` strings from ``corpus``, best match first.
    """
    # create api
    bm25 = create_clean_bm25(corpus)

    # The index holds lowercased, punctuation-free tokens, so the query has to
    # be normalized the same way; otherwise tokens such as "London" or
    # "TODAY" can never match anything.
    punctuation_table = str.maketrans("", "", string.punctuation)
    tokenized_query = query.translate(punctuation_table).lower().split()

    # Rank against the cleaned index, but hand back the original strings.
    return bm25.get_top_n(tokenized_query, corpus, n=n)
    
    
def your_code():
    """Example: run a single BM25 search over a small in-memory corpus."""
    query = "windy London"
    corpus = [
        "Hello there good man!",
        "It is quite windy in London",
        "How is the weather today?",
    ]

    # Ask for the two best matches; they come back as strings from `corpus`.
    bm25_results = search_with_bm25(corpus, query, n=2)

This works for most cases!

Note: if you are doing multiple searches, this approach cleans, tokenizes, and indexes a new BM25 model every time you search.

Python
def your_code():
    """Example: run several queries against one prebuilt BM25 index."""
    corpus = [
        "Hello there good man!",
        "It is quite windy in London",
        "How is the weather today?"
    ]
    queries = [
        "windy London",
        "weather TODAY"
    ]

    # Clean, tokenize and index the corpus once, up front.
    bm25 = create_clean_bm25(corpus)

    # Queries need the same normalization as the indexed documents
    # (punctuation removed, lowercased) or their tokens will not match.
    punctuation_table = str.maketrans("", "", string.punctuation)

    for query in queries:
        tokenized_query = query.translate(punctuation_table).lower().split()
        # Query the prebuilt index directly; calling search_with_bm25 here
        # would rebuild the whole index for every query.
        bm25_results = bm25.get_top_n(tokenized_query, corpus, n=2)
        print(bm25_results)