from gensim.models import Word2Vec
from gensim import corpora, models, similarities
import numpy as np
import scipy.optimize
from scipy import spatial
import time
import jieba
import itertools
jieba.load_userdict('/export/user/shizhengxin/word2vec/jiebaUserWord.txt')
# Constraints
def positivity(f):
    """ Constraint 1: flow is non-negative, i.e. mass moves only from source to target """
    return f
def fromSrc(f, wp, i, shape):
    """ Constraint 2: Limits supply for source according to weight """
    fr = np.reshape(f, shape)
    f_sumColi = np.sum(fr[i, :])
    return wp[i] - f_sumColi
def toTgt(f, wq, j, shape):
    """ Constraint 3: Limits demand for target according to weight """
    fr = np.reshape(f, shape)
    f_sumRowj = np.sum(fr[:, j])
    return wq[j] - f_sumRowj
def maximiseTotalFlow(f, wp, wq):
    """ Constraint 4: Forces maximum supply to move from source to target """
    return f.sum() - np.minimum(wp.sum(), wq.sum())
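# Together with the objective below, these constraints define the standard
# transportation problem behind the EMD:
#   minimise    sum_ij f_ij * d_ij
#   subject to  f_ij >= 0                          (positivity)
#               sum_j f_ij <= wp_i                 (fromSrc)
#               sum_i f_ij <= wq_j                 (toTgt)
#               sum_ij f_ij = min(sum wp, sum wq)  (maximiseTotalFlow)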
# Objective function
def flow(f, D):
    """ The objective function. The flow represents the amount of goods to be moved from source to target """
    f = np.reshape(f, D.shape)
    return (f * D).sum()
# Distance
def groundDistance(x1, x2, norm=2):
    """ L-norm distance. Default: norm = 2 (Euclidean) """
    return np.linalg.norm(x1 - x2, norm)
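# For unit vectors the default L2 ground distance relates to cosine similarity
# via ||x1 - x2|| = sqrt(2 - 2 * cos(x1, x2)); note that getDistMatrix below
# builds its matrix from cosine distance directly rather than calling this.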
def get_normlize_data(common_list, model):
    """ Returns a matrix whose rows are the unit-normalised word vectors of common_list """
    common_list = list(common_list)
    vector = []
    for word in common_list:
        raw = model.wv[word]
        vector.append(raw / np.linalg.norm(raw))  # scale each word vector to unit length
    return np.mat(vector)
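# Usage sketch (assumes a trained gensim Word2Vec model and that both words
# are in its vocabulary): get_normlize_data(['word1', 'word2'], model)
# returns a 2 x dim np.mat whose rows all have unit L2 norm.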
def get_standard_ques(lists, model, dict_sim):
    """ Tokenises each standard question and precomputes its normalised vectors, weights and signature """
    all_standard_ques = []
    all_weights = []
    all_signature = []
    for i in lists:
        list2 = throw_stopwords(list(jieba.cut(i)), stopwords)
        list2 = list(set(list2))
        list2 = [sed for sed in list2 if sed in model.wv.vocab]
        list2_new, weights2, signature2 = get_signature(list2, dict_sim)
        all_standard_ques.append(get_normlize_data(list2_new, model))
        all_weights.append(weights2)
        all_signature.append(signature2)
    return all_standard_ques, all_weights, all_signature
# Distance matrix
def getDistMatrix(s1, s2, model):
    """ Computes the distance matrix between the source and target distributions.
    Rows are unit-normalised, so s1.dot(s2.T) is the cosine similarity and
    1 - similarity the cosine distance. """
    s1 = get_normlize_data(s1, model)
    s2 = get_normlize_data(s2, model)
    distance_matrix = np.ones((s1.shape[0], s2.shape[0])) - s1.dot(s2.T)
    # Sharpen the matrix: clamp near-orthogonal pairs to distance 1 and
    # near-identical pairs to distance 0
    distance_matrix = np.where(distance_matrix >= 0.9, 1, distance_matrix)
    distance_matrix = np.where(distance_matrix <= 0.4, 0, distance_matrix)
    return distance_matrix
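# For example, a cosine similarity of 0.95 gives a distance of 0.05, which the
# second np.where clamps to 0 (the words are treated as identical), while a
# similarity of 0.05 gives a distance of 0.95, clamped to 1 by the first.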
# Flow matrix
def getFlowMatrix(P, Q, D):
    """ Computes the flow matrix between P and Q by solving the transportation problem """
    numFeats1 = P[0].shape[0]
    numFeats2 = Q[0].shape[0]
    shape = (numFeats1, numFeats2)
    # Constraints
    cons1 = [{'type': 'ineq', 'fun': positivity},
             {'type': 'eq', 'fun': maximiseTotalFlow, 'args': (P[1], Q[1],)}]
    cons2 = [{'type': 'ineq', 'fun': fromSrc, 'args': (P[1], i, shape,)} for i in range(numFeats1)]
    cons3 = [{'type': 'ineq', 'fun': toTgt, 'args': (Q[1], j, shape,)} for j in range(numFeats2)]
    cons = cons1 + cons2 + cons3
    # Solve for F (solve the transportation problem)
    F_guess = np.zeros(D.shape)
    F = scipy.optimize.minimize(flow, F_guess, args=(D,), constraints=cons)
    F = np.reshape(F.x, (numFeats1, numFeats2))
    return F
# Normalised EMD
def EMD(F, D):
    """ EMD formula, normalised by the flow """
    return (F * D).sum() / F.sum()
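# A minimal sanity check of the solver above (toy data, no word vectors):
# two identical two-point distributions under a 0/1 ground distance should
# yield an EMD of roughly zero, since all mass can stay in place.
_P_toy = (np.array([[0.0], [1.0]]), np.array([0.5, 0.5]))
_Q_toy = (np.array([[0.0], [1.0]]), np.array([0.5, 0.5]))
_D_toy = np.array([[0.0, 1.0], [1.0, 0.0]])
_F_toy = getFlowMatrix(_P_toy, _Q_toy, _D_toy)
print('toy EMD (expected ~0):', EMD(_F_toy, _D_toy))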
# Runs EMD program
def getEMD(P, Q, model, F, D):
    """ Computes the Earth Mover's Distance between the distributions P and Q.
    P and Q are of shape (2, N): the first row is the set of N features and
    the second row the corresponding set of N weights. F and D are the
    precomputed flow and distance matrices (see getFlowMatrix / getDistMatrix). """
    return EMD(F, D)
# Example 1
path = 'stopWords.txt'
def get_stopwords(path):
    w = open(path, 'r', encoding='utf-8')
    stopword = []
    for line in w.readlines():
        newline = line.strip()
        stopword.append(list(jieba.cut(newline)))
    w.close()
    # Flatten into a set: membership tests on a bare itertools.chain
    # iterator would only work once, since the iterator gets exhausted
    return set(itertools.chain(*stopword))
stopwords = get_stopwords(path)
def throw_stopwords(words, stopwords):
    """ Removes stopwords from a token list """
    new_list = []
    for line in words:
        if line not in stopwords:
            new_list.append(line)
    return new_list
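# Usage sketch: throw_stopwords(['我', '想', '退款'], stopwords) would return
# ['退款'] if '我' and '想' appear in stopWords.txt (tokens here are
# illustrative only).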
wp = open('question_from_scrawl_200.txt','r',encoding='utf-8')
all_ques = []
for line in wp.readlines():
    newline = line.strip()
    all_ques.append(newline)
wp.close()
tf_idfmodels = models.TfidfModel.load('./tf_idf_v1.model')
dictionary = corpora.Dictionary.load('./dictionary_v2')
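# The tf-idf model and dictionary are assumed to have been fitted on the same
# corpus beforehand; they supply the per-word weights used by get_signature,
# get_vector and get_vector2 below.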
def get_similar():
    """ Loads the synonym table: maps each word to its canonical form """
    ws = open('similary_word.txt', 'r', encoding='utf-8')
    dict_sim = {}
    for line in ws.readlines():
        newline = line.strip().split('\t')
        for word in newline[1:]:
            dict_sim[word] = newline[0]
    ws.close()
    return dict_sim
def get_sub_list(lists, dict_sim):
    """ Replaces each word with its canonical synonym where one exists """
    new_list = []
    for word in lists:
        if word in dict_sim.keys():
            new_list.append(dict_sim[word])
        else:
            new_list.append(word)
    return new_list
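# e.g. with dict_sim = {'退钱': '退款'}, get_sub_list(['我', '想', '退钱'], dict_sim)
# returns ['我', '想', '退款']: every word is replaced by its canonical synonym
# when the table has one.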
def get_signature(listss, dict_sim):
    """ Builds an EMD signature (features, tf-idf weights) for a token list """
    listss = get_sub_list(listss, dict_sim)
    corpus = [dictionary.doc2bow(listss)]
    corpus_tfidf = list(tf_idfmodels[corpus])[0]
    weights1 = []
    list1_new = []
    for word_id, score in corpus_tfidf:
        weights1.append(score)
        list1_new.append(dictionary.get(word_id))
    # Normalise the tf-idf scores so the weights sum to 1
    total = sum(weights1)
    weights1 = np.array([w / total for w in weights1])
    list1_new = np.array(list1_new)
    signature1 = (list1_new, weights1)
    return list1_new, weights1, signature1
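# The returned signature is the (features, weights) pair that getFlowMatrix
# consumes: P[0] holds the N feature words and P[1] their normalised tf-idf
# weights, matching the (2, N) layout described in getEMD.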
def get_standard_data():
    ws = open('new_faqs_data_v2.txt', 'r', encoding='utf-8')
    ques_standard = [line.strip().split('----')[2] for line in ws.readlines()]
    ws.close()
    return ques_standard
def getExampleSignatures():
    """ Scores every crawled question against the standard FAQ questions and
    appends the five closest matches (index----score) to the output file """
    dict_sim = get_similar()
    wk = open('question_score_tfidf_weight_v3.txt', 'a', encoding='utf-8')
    model = Word2Vec.load("./word2vec_test_v7.model")
    ques_standard = get_standard_data()
    ti = 0
    all_standard_ques, all_weights, all_signature = get_standard_ques(ques_standard, model, dict_sim)
    for list1_1 in all_ques:
        times1 = time.time()
        list1 = throw_stopwords(list(jieba.cut(list1_1)), stopwords)
        list1 = list(set(list1))
        list1 = [i for i in list1 if i in model.wv.vocab]
        list1_new, weights1, signature1 = get_signature(list1, dict_sim)
        raw_distance = {}
        for newline in range(len(all_standard_ques)):
            D = getDistMatrix(list1_new, all_standard_ques[newline], model)
            F = getFlowMatrix(signature1, all_signature[newline], D)
            ti += 1
            raw_distance[newline] = getEMD(signature1, all_signature[newline], model, F, D)
            if ti == 1000:
                times2 = time.time()
                print('time spend', times2 - times1)
        new_sort = sorted(raw_distance.items(), key=lambda x: x[1], reverse=False)
        # The keys of raw_distance are integer indices into ques_standard,
        # so cast them to str before writing
        wk.write(list1_1 + '****')
        for line, score in new_sort[:4]:
            wk.write(str(line) + '----' + str(score) + '----')
        wk.write(str(new_sort[4][0]) + '----' + str(new_sort[4][1]) + '\n')
# getExampleSignatures()
def get_percent(F):
    """ Prunes the flow matrix: zeroes every entry at or below its 70th percentile """
    percen_10 = np.percentile(F, 70)
    F = np.where(F <= percen_10, 0, F)
    return F
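# e.g. get_percent(np.array([[0.1, 0.5], [0.2, 0.9]])): the 70th percentile of
# the entries is 0.54, so everything except 0.9 is zeroed ->
# [[0., 0.], [0., 0.9]].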
def get_common_list(list1, list2, dict_sim):
    """ Maps both token lists to canonical synonyms and builds their shared vocabulary """
    list1 = get_sub_list(list1, dict_sim)
    list2 = get_sub_list(list2, dict_sim)
    return list1, list2, list(set(list1).union(set(list2)))  # union of the two sentences' tokens
def get_vector(list1, common_list):
    """ Builds a tf-idf weighted bag-of-words vector for list1 over the shared vocabulary """
    list1 = list(set(list1))
    corpus = [dictionary.doc2bow(list1)]
    corpus_tfidf = list(tf_idfmodels[corpus])[0]
    list1_new = {}
    for word_id, score in corpus_tfidf:
        list1_new[dictionary.get(word_id)] = score
    # Build the BOW vector over the shared vocabulary
    vector = [0] * len(common_list)
    for ele in list1_new.keys():
        vector[common_list.index(ele)] = list1_new[ele]
    if sum(vector) == 0:
        # Degenerate case: no token received any tf-idf weight
        return np.array(common_list), np.array(vector), (np.array(common_list), np.array(vector))
    # Normalise so the weights sum to 1
    total = sum(vector)
    vector = [x / total for x in vector]
    common_list = np.array(common_list)
    weights = np.array(vector)
    return common_list, weights, (common_list, weights)
def get_vector2(list1, common_list):
    """ Variant of get_vector: tf-idf is computed over the shared vocabulary itself
    and the weights are left unnormalised """
    corpus = [dictionary.doc2bow(common_list)]
    corpus_tfidf = list(tf_idfmodels[corpus])[0]
    list1 = list(set(list1))
    list1_new = {}
    for word_id, score in corpus_tfidf:
        list1_new[dictionary.get(word_id)] = score
    # Keep only the weights of words that actually occur in list1
    weights = [0] * len(common_list)
    for ele in list1:
        if ele in list1_new.keys():
            weights[common_list.index(ele)] = list1_new[ele]
    common_list = np.array(common_list)
    weights = np.array(weights)
    return common_list, weights, (common_list, weights)
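# Note: get_vector2 is not called anywhere in this script; get_F below uses
# get_vector, whose weights are normalised to sum to 1.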
def get_F():
    """ Interactive demo: reads two sentences and prints the raw and pruned flow/WMD between them """
    dict_sim = get_similar()
    model = Word2Vec.load("./word2vec_test_v7.model")
    while True:
        print('input line:')
        list1_1 = input()
        list1 = throw_stopwords(list(jieba.cut(list1_1)), stopwords)
        list1 = list(set(list1))
        list1 = [i for i in list1 if i in model.wv.vocab]
        print('input line:')
        list2 = input()
        list2 = throw_stopwords(list(jieba.cut(list2)), stopwords)
        list2 = list(set(list2))
        list2 = [sed for sed in list2 if sed in model.wv.vocab]
        list1, list2, common_list = get_common_list(list1, list2, dict_sim)
        print(common_list)
        list1_new, weights1, signature1 = get_vector(list1, common_list)
        list2_new, weights2, signature2 = get_vector(list2, common_list)
        D = getDistMatrix(list1_new, list2_new, model)
        F = getFlowMatrix(signature1, signature2, D)
        distance = getEMD(signature1, signature2, model, F, D)
        print('original weights', F)
        print('original wmd', distance)
        F = get_percent(F)
        distance = getEMD(signature1, signature2, model, F, D)
        print('improved weights', F)
        print('improved wmd', distance)
get_F()
That concludes this Python source-code share. Thanks for reading.