短文本相似度最好算法

代码中会用到一些模型,模型在下方的原文链接里;如果链接已失效,可以从文末的下载链接获取。

原文链接:GitHub – liuhuanyong/SentenceSimilarity: self complement of Sentence Similarity compute based on cilin, hownet, simhash, wordvector,vsm models,基于同义词词林,知网,指纹,字词向量,向量空间模型的句子相似度计算。

原文有几段代码无法正常运行,我对其进行了相关修改。

先展示一下最终效果吧:

# Demo: compare two sentences with all five similarity models defined below.
# import_ipynb lets Jupyter-notebook modules be imported like regular .py modules.
import import_ipynb

from 字符向量 import *  # character-vector model (SimTokenVec)
from 词向量 import *  # word-vector model (SimWordVec)
from 词林 import *  # Cilin thesaurus model (SimCilin)
from 知网 import *  # HowNet sememe model (SimHownet)
from 指纹 import *  # simhash fingerprint model (SimHaming)

text1 = "我是中国人,我深爱着我的祖国"
text2 = "中国是我的母亲,我热爱她"

# Every model exposes the same interface: distance(text1, text2) -> similarity score.
print("基于同义词词林的相似度:",SimCilin().distance(text1, text2))
print("基于词向量的相似度:",SimWordVec().distance(text1, text2))
print("基于知网的相似度:",SimHownet().distance(text1, text2))
print("基于指纹的相似度:",SimHaming().distance(text1, text2))
print("基于字符向量的相似度:",SimTokenVec().distance(text1, text2))

短文本相似度最好算法

话不多说,直接上代码:

# 基于同义词词林的相似度计算

import codecs
import jieba.posseg as pseg
class SimCilin:
    """Sentence similarity based on the Cilin (同义词词林) synonym thesaurus.

    Each word maps to one or more Cilin semantic codes; word similarity is
    the best score over all code pairs, and sentence similarity aggregates
    each word's best match against the other sentence.
    """

    def __init__(self):
        self.cilin_path = 'model/cilin.txt'
        self.sem_dict = self.load_semantic()

    def load_semantic(self):
        """Load the thesaurus and return {word: [semantic_code, ...]}.

        Each line of cilin.txt is "<code> word1 word2 ..."; a word that
        appears under several codes accumulates all of them.
        """
        sem_dict = {}
        for line in codecs.open(self.cilin_path, 'r', 'utf-8'):
            parts = line.strip().split(' ')
            sem_type = parts[0]
            for word in parts[1:]:
                if word not in sem_dict:
                    sem_dict[word] = sem_type
                else:
                    sem_dict[word] += ';' + sem_type
        # Split the ';'-joined codes into lists once loading is done.
        for word, sem_type in sem_dict.items():
            sem_dict[word] = sem_type.split(';')
        return sem_dict

    def compute_word_sim(self, word1, word2):
        """Best (max) code-pair similarity of two words; 0 if either is OOV."""
        sems_word1 = self.sem_dict.get(word1, [])
        sems_word2 = self.sem_dict.get(word2, [])
        score_list = [self.compute_sem(s1, s2)
                      for s1 in sems_word1 for s2 in sems_word2]
        return max(score_list) if score_list else 0

    def compute_sem(self, sem1, sem2):
        """Score two Cilin codes by comparing their hierarchical fields.

        Codes are split into 6 fields; higher (coarser) levels earn more
        points (3+3+2+1+1 = 10 max), normalized to [0, 1].
        """
        sem1 = [sem1[0], sem1[1], sem1[2:4], sem1[4], sem1[5:7], sem1[-1]]
        sem2 = [sem2[0], sem2[1], sem2[2:4], sem2[4], sem2[5:7], sem2[-1]]
        score = 0
        for index in range(len(sem1)):
            if sem1[index] == sem2[index]:
                if index in [0, 1]:
                    score += 3
                elif index == 2:
                    score += 2
                elif index in [3, 4]:
                    score += 1
        return score / 10

    def distance(self, text1, text2):
        """Sentence similarity: symmetric average of per-word best matches."""
        words1 = [w.word for w in pseg.cut(text1) if w.flag[0] not in ['u', 'x', 'w']]
        words2 = [w.word for w in pseg.cut(text2) if w.flag[0] not in ['u', 'x', 'w']]
        # Fix: the original crashed (max() of empty sequence, then
        # ZeroDivisionError) when either sentence had no content words
        # left after the POS filter above.
        if not words1 or not words2:
            return 0.0
        score_words1 = [max(self.compute_word_sim(w1, w2) for w2 in words2)
                        for w1 in words1]
        score_words2 = [max(self.compute_word_sim(w2, w1) for w1 in words1)
                        for w2 in words2]
        return max(sum(score_words1) / len(words1),
                   sum(score_words2) / len(words2))

# 基于词向量的相似度计算

import gensim, logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import numpy as np
import jieba.posseg as pesg

class SimWordVec:
    """Sentence similarity: cosine of averaged 200-dim word embeddings."""

    def __init__(self):
        self.embedding_path = 'model/token_vector.bin'
        # Text-format word2vec file.
        self.model = gensim.models.KeyedVectors.load_word2vec_format(self.embedding_path, binary=False)

    def get_wordvector(self, word):
        """Return the embedding of `word`, or a zero vector for OOV words."""
        try:
            return self.model[word]
        except KeyError:  # fix: was a bare except that also hid real errors
            return np.zeros(200)

    def similarity_cosine(self, word_list1, word_list2):
        """Cosine similarity of the two averaged word-vector lists.

        Returns 0.0 for degenerate input (empty list, or all words OOV)
        where the original raised ZeroDivisionError or produced nan.
        """
        if not word_list1 or not word_list2:
            return 0.0
        vector1 = np.zeros(200)
        for word in word_list1:
            vector1 += self.get_wordvector(word)
        vector1 = vector1 / len(word_list1)
        vector2 = np.zeros(200)
        for word in word_list2:
            vector2 += self.get_wordvector(word)
        vector2 = vector2 / len(word_list2)
        norm1 = np.linalg.norm(vector1)
        norm2 = np.linalg.norm(vector2)
        if norm1 == 0 or norm2 == 0:
            return 0.0  # every word OOV -> zero vector, cosine undefined
        return float(np.dot(vector1, vector2) / (norm1 * norm2))

    def distance(self, text1, text2):
        """Segment both texts (dropping w/x/u POS tags) and compare them."""
        word_list1 = [w.word for w in pesg.cut(text1) if w.flag[0] not in ['w', 'x', 'u']]
        word_list2 = [w.word for w in pesg.cut(text2) if w.flag[0] not in ['w', 'x', 'u']]
        return self.similarity_cosine(word_list1, word_list2)

# def test():
#     text1 = '我喜歡你'
#     text2 = '我愛你'
#     simer = SimWordVec()
#     print(simer.distance(text1, text2))

# test()

# 基于知网的相似度计算

import chardet
import jieba.posseg as pseg

class SimHownet:
    """Sentence similarity based on HowNet (知网) semantic definitions."""

    def __init__(self):
        self.semantic_path = 'model/hownet.dat'
        self.semantic_dict = self.load_semanticwords()

    def load_semanticwords(self):
        """Load {word: [definition, ...]} from hownet.dat.

        The data file ships in varying encodings, so the encoding is
        auto-detected with chardet before parsing.
        """
        with open(self.semantic_path, 'rb') as f:
            result = chardet.detect(f.read())

        with open(self.semantic_path, 'r', encoding=result['encoding']) as f:
            semantic_dict = {}
            for line in f:
                words = [word for word in line.strip().replace(' ', '>').replace('\t', '>').split('>') if word != '']
                # Fix: skip malformed lines instead of raising IndexError.
                if len(words) < 3:
                    continue
                semantic_dict[words[0]] = words[2].split(',')
        return semantic_dict

    def calculate_semantic(self, DEF1, DEF2):
        """Jaccard similarity |DEF1 ∩ DEF2| / |DEF1 ∪ DEF2| of two definitions."""
        DEF_INTERSECTION = set(DEF1).intersection(set(DEF2))
        DEF_UNION = set(DEF1).union(set(DEF2))
        if not DEF_UNION:
            return 0.0  # fix: both definitions empty -> avoid 0/0
        return float(len(DEF_INTERSECTION)) / float(len(DEF_UNION))

    def compute_similarity(self, word1, word2):
        """Best (max) Jaccard score over all definition pairs; 0 for OOV words."""
        DEFS_word1 = self.semantic_dict.get(word1, [])
        DEFS_word2 = self.semantic_dict.get(word2, [])
        scores = [self.calculate_semantic(d1, d2)
                  for d1 in DEFS_word1 for d2 in DEFS_word2]
        return max(scores) if scores else 0

    def distance(self, text1, text2):
        """Sentence similarity: symmetric average of per-word best matches."""
        words1 = [w.word for w in pseg.cut(text1) if w.flag[0] not in ['u', 'x', 'w']]
        words2 = [w.word for w in pseg.cut(text2) if w.flag[0] not in ['u', 'x', 'w']]
        # Fix: the original crashed (max() of empty sequence, then
        # ZeroDivisionError) when either sentence had no content words.
        if not words1 or not words2:
            return 0.0
        score_words1 = [max(self.compute_similarity(w1, w2) for w2 in words2)
                        for w1 in words1]
        score_words2 = [max(self.compute_similarity(w2, w1) for w1 in words1)
                        for w2 in words2]
        return max(sum(score_words1) / len(words1),
                   sum(score_words2) / len(words2))

# def test():
#     text1 = '周杰伦是一个歌手'
#     text2 = '刘若英是个演员'
    
#     simer = SimHownet()
#     sim = simer.distance(text1, text2)
#     print(sim)

# test()
# 基于指纹的相似度计算

from simhash import Simhash
import jieba.posseg as pseg

class SimHaming:
    """Sentence similarity based on 64-bit simhash fingerprints."""

    def haming_distance(self, code_s1, code_s2):
        """Hamming distance of two 64-bit fingerprints (popcount of the XOR)."""
        x = (code_s1 ^ code_s2) & ((1 << 64) - 1)
        ans = 0
        while x:
            ans += 1
            x &= x - 1  # clears the lowest set bit (Kernighan's trick)
        return ans

    def get_similarity(self, a, b):
        """Ratio similarity min(a, b) / max(a, b) of two numbers.

        Fix: the original raised ZeroDivisionError for a == b == 0; the
        equality short-circuit returns 1.0 there and preserves the
        original result (1.0) for any other equal pair.
        """
        if a == b:
            return 1.0
        if a > b:
            return b / a
        return a / b

    def get_features(self, string):
        """Segment `string`, dropping function-word/punctuation POS tags."""
        return [w.word for w in pseg.cut(string)
                if w.flag[0] not in ['u', 'x', 'w', 'o', 'p', 'c', 'm', 'q']]

    def get_distance(self, code_s1, code_s2):
        """Alias for haming_distance, kept for API compatibility."""
        return self.haming_distance(code_s1, code_s2)

    def get_code(self, string):
        """64-bit simhash fingerprint of the text's word features."""
        return Simhash(self.get_features(string)).value

    def distance(self, s1, s2):
        """Similarity in [0, 1]: 1 - hamming_distance / 64."""
        code_s1 = self.get_code(s1)
        code_s2 = self.get_code(s2)
        return (100 - self.haming_distance(code_s1, code_s2) * 100 / 64) / 100

# def test():
#     text1 = '我喜欢你'
#     text2 = '我讨厌你'
#     simer = SimHaming()
#     sim = simer.distance(text1, text2)
#     print(sim)

# test()
#基于字符向量的相似度计算

import gensim, logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import numpy as np

class SimTokenVec:
    """Sentence similarity: cosine of averaged 200-dim character embeddings."""

    def __init__(self):
        self.embedding_path = 'model/token_vector.bin'
        # Text-format word2vec file (same embeddings file as SimWordVec).
        self.model = gensim.models.KeyedVectors.load_word2vec_format(self.embedding_path, binary=False)

    def get_wordvector(self, word):
        """Return the embedding of a character/token, or zeros for OOV."""
        try:
            return self.model[word]
        except KeyError:  # fix: was a bare except that also hid real errors
            return np.zeros(200)

    def similarity_cosine(self, word_list1, word_list2):
        """Cosine similarity of the two averaged token-vector lists.

        Returns 0.0 for degenerate input (empty list, or all tokens OOV)
        where the original raised ZeroDivisionError or produced nan.
        """
        if not word_list1 or not word_list2:
            return 0.0
        vector1 = np.zeros(200)
        for token in word_list1:
            vector1 += self.get_wordvector(token)
        vector1 = vector1 / len(word_list1)
        vector2 = np.zeros(200)
        for token in word_list2:
            vector2 += self.get_wordvector(token)
        vector2 = vector2 / len(word_list2)
        norm1 = np.linalg.norm(vector1)
        norm2 = np.linalg.norm(vector2)
        if norm1 == 0 or norm2 == 0:
            return 0.0  # every token OOV -> zero vector, cosine undefined
        return float(np.dot(vector1, vector2) / (norm1 * norm2))

    def distance(self, text1, text2):
        """Split each text into individual characters and compare them."""
        word_list1 = [word for word in text1]
        word_list2 = [word for word in text2]
        return self.similarity_cosine(word_list1, word_list2)

# def test():
#     text1 = '我喜欢你'
#     text2 = '我讨厌你'
#     simer = SimTokenVec()
#     sim = simer.distance(text1, text2)
#     print(sim)
# test()

代码中会用到一些模型,模型在上述的原文链接里,如果链接已失效,可以从这里下载

https://download.csdn.net/download/qq_53962537/87785392

今天的文章《短文本相似度最好算法》分享到此就结束了,感谢您的阅读。

版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 举报,一经查实,本站将立刻删除。
如需转载请保留出处:https://bianchenghao.cn/85752.html

(0)
编程小号编程小号

相关推荐

发表回复

您的电子邮箱地址不会被公开。 必填项已用 * 标注