Find Similar Sentences in the Holy Quran: Python Example

# -*- coding: utf-8 -*-
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import pandas
# Arabic stop words, held in a frozenset so every membership test made while
# filtering tokens is O(1) instead of a linear scan over a list.
arastopword = frozenset(stopwords.words('arabic'))

# Load the corpus into a single-column frame whose column is named 'aya'.
# NOTE(review): read_csv splits on ',' by default; this assumes each line of
# the file is one aya containing no ASCII comma -- confirm against the data.
names = ['aya']
data = pandas.read_csv('data/qr_with/quran.txt',  names=names)
print("All Ayat: ", len(data ))
print(data)

# Search terms to look for in the corpus
Searchfor = 'هارون وزير فرعون'

# Minimum cosine similarity (exclusive) for an aya to be kept as a match
simdegree = 0.2

# Accumulates [aya_text, rounded_similarity] pairs for every match
Result = []
def findsimilarity(X, Y):
    """Score Y against X and record Y in Result when it is similar enough.

    Both strings are tokenized with ``word_tokenize``; tokens present in
    ``arastopword`` are discarded and the remaining distinct tokens of each
    string are treated as a binary (0/1) term vector. The cosine similarity
    of the two vectors is then compared with ``simdegree``.

    Args:
        X: The query text.
        Y: The candidate text to compare with the query.

    Returns:
        None. As a side effect, ``[Y, round(cosine, 4)]`` is appended to the
        module-level ``Result`` list when the cosine is greater than
        ``simdegree``.
    """
    # Distinct non-stop-word tokens of each text.
    X_set = {w for w in word_tokenize(X) if w not in arastopword}
    Y_set = {w for w in word_tokenize(Y) if w not in arastopword}

    # For 0/1 vectors the dot product is the number of shared tokens and the
    # product of the norms is sqrt(|X_set| * |Y_set|).
    shared = len(X_set & Y_set)
    norm = float((len(X_set) * len(Y_set)) ** 0.5)

    # If either text has no tokens left (empty, or only stop words) the norm
    # is zero; score it as 0.0 rather than raising ZeroDivisionError.
    cosine = shared / norm if norm else 0.0

    if cosine > simdegree:
        Result.append([Y, round(cosine, 4)])
# Score every entry of the 'aya' column against the search terms; matches are
# collected in Result by findsimilarity.
for d in data['aya']:
    findsimilarity(Searchfor, d)

def takeSecond(elem):
    """Return the item stored at index 1 of ``elem``."""
    second = elem[1]
    return second

# Order the matches in place by ascending similarity score
Result.sort(key=takeSecond)

# Emit each match as three lines: the text, its score, then a blank line
for match in Result:
    print(match[0], match[1], '', sep='\n')

One thought on “Find similar Sentences in holy Quran, Python Example”

Comments are closed.

Proudly powered by WordPress | Theme: Rits Blog by Crimson Themes.