# -*- coding: utf-8 -*-
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas
# Arabic Stop words
arastopword = stopwords.words('arabic')
names = ['aya']
data = pandas.read_csv('data/qr_with/quran.txt', names=names)
print("All Ayat: ", len(data ))
print(data)
# Enter terms
Searchfor = 'هارون وزير فرعون'
# similarity degree
simdegree = 0.2
Result = []
def findsimilarity(X, Y):
# tokenization
X_list = word_tokenize(X)
Y_list = word_tokenize(Y)
# sw contains the list of stopwords
#sw = stopwords.words('arabic')
l1 =[];l2 =[]
# remove stop words from string
X_set = {w for w in X_list if not w in arastopword}
Y_set = {w for w in Y_list if not w in arastopword}
# form a set containing keywords of both strings
rvector = X_set.union(Y_set)
for w in rvector:
#print(w)
if w in X_set: l1.append(1) # create a vector
else: l1.append(0)
if w in Y_set: l2.append(1)
else: l2.append(0)
c = 0
# cosine formula
for i in range(len(rvector)):
c+= l1[i]*l2[i]
cosine = c / float((sum(l1)*sum(l2))**0.5)
#print("similarity: " , cosine)
if cosine > simdegree:
Result.append([Y, round(cosine, 4)])
or d in data['aya']:
findsimilarity(Searchfor, d)
def takeSecond(elem):
return elem[1]
# sort the result
Result.sort(key=takeSecond)
# print the final result
for j in Result:
print(j[0])
print(j[1])
print('')
One thought on “Find similar Sentences in holy Quran, Python Example”
Hi, this is a comment.
To get started with moderating, editing, and deleting comments, please visit the Comments screen in the dashboard.
Commenter avatars come from Gravatar.
Comments are closed.
Proudly powered by WordPress |
Theme: Rits Blog by Crimson Themes.
Hi, this is a comment.
To get started with moderating, editing, and deleting comments, please visit the Comments screen in the dashboard.
Commenter avatars come from Gravatar.