22 lines
738 B
Python
22 lines
738 B
Python
from folkugat_web.config import search as config
|
|
from folkugat_web.model.search import NGrams
|
|
from folkugat_web.utils import groupby
|
|
|
|
|
|
# The union annotation is quoted so the signature also loads on interpreters
# that predate the `X | Y` annotation syntax.
def get_all_ngrams(text: str, *, min_length: "int | None" = None) -> list[tuple[int, str]]:
    """Return every contiguous substring (n-gram) of *text* with its length.

    Args:
        text: The string to slice into n-grams.
        min_length: Shortest n-gram length to emit. When omitted, the
            project-wide ``config.MIN_NGRAM_LENGTH`` is used. Values below 1
            are treated as 1, so empty n-grams are never produced.

    Returns:
        A list of ``(length, ngram)`` tuples ordered by increasing length and,
        within one length, by start position. Repeated substrings appear once
        per occurrence; no de-duplication is done here. The list is empty when
        *text* is shorter than the effective minimum length.
    """
    if min_length is None:
        min_length = config.MIN_NGRAM_LENGTH
    # Clamp once up front instead of re-testing ``size > 0`` for every start
    # offset: lengths of zero or below would only yield empty slices.
    shortest = max(1, min_length)
    text_length = len(text)
    return [
        (size, text[start:start + size])
        for size in range(shortest, text_length + 1)
        for start in range(text_length - size + 1)
    ]
|
|
|
|
|
|
def get_text_ngrams(*texts: str) -> NGrams:
    """Collect the distinct n-grams of every word in *texts*, keyed by length.

    Each text is split on whitespace and every word is lower-cased before its
    n-grams are generated with ``get_all_ngrams``. The ``(length, ngram)``
    pairs of all words are then handed to the project ``groupby`` helper,
    keyed on the length, with each group reduced to a list of unique n-gram
    strings.

    Returns:
        A ``dict`` built from the ``groupby`` result. The order of the n-grams
        inside each list comes from a ``set`` and is therefore unspecified.

    NOTE(review): the grouping semantics are those of
    ``folkugat_web.utils.groupby``, which is not visible here. Presumably it
    groups over the whole input rather than only consecutive runs; confirm.
    """
    length_ngram_pairs = []
    for text in texts:
        for word in text.split():
            length_ngram_pairs.extend(get_all_ngrams(word.lower()))

    def unique_ngrams(group):
        # Drop repeated n-grams within one length bucket.
        return list(set(pair[1] for pair in group))

    return dict(groupby(
        length_ngram_pairs,
        key_fn=lambda pair: pair[0],
        group_fn=unique_ngrams,
    ))
|