皇天不负苦心人,终于实现了。用trac(最早那个版本700M数据)的数据测了一下,和lucene原有算法的排序基本一致。共享一下代码(很粗陋,还需要改很多):
package org.apache.lucene.BM25;
import java.io.IOException;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.Vector;
import org.apache.lucene.BM25.bm25.BM25BooleanQuery.BooleanTermQuery;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
/**
 * Minimal BM25 scorer built directly on an {@link IndexReader}.
 *
 * <p>Usage: construct with a reader, then call {@link #search(Query)}. The query's terms are
 * extracted, the postings of the first term define the candidate documents, and every further
 * term adds its BM25 contribution to the candidates it also occurs in. Only documents that
 * contain all query terms are printed.
 *
 * <p>Document lengths are currently a placeholder (every document has length 1.0, see
 * {@link #GetLength(int[])}), so the length normalisation in {@link #GetTF(int, double)} has no
 * effect yet.
 *
 * <p>Instances keep per-search state in fields and are therefore not safe for concurrent use.
 */
public class BM25OfMine {
    /** Number of postings requested from {@code TermDocs.read} per call. */
    private static final int READ_CHUNK = 200;
    /** BM25 term-frequency saturation parameter. */
    private static final double K1 = 2.0;
    /** BM25 length-normalisation parameter. */
    private static final double B = 0.75;

    private Set<Term> terms = new HashSet<Term>();
    private Term t[] = null; // terms of the current query
    private IndexReader reader = null;
    private int DocNo[] = null; // candidate document numbers (postings of the first term)
    private float length[] = null; // length of each candidate document
    private double Score[] = null; // accumulated score of each candidate document
    private int NumOfDoc = 0; // total number of documents, set by search()
    // Postings of the term most recently passed to GetTermFreq; only the first
    // postingCount entries are valid.
    private int Doc[] = new int[READ_CHUNK];
    private int frq[] = new int[READ_CHUNK];
    private int postingCount = 0;
    private double AverLength = 100; // average document length used by GetTF
    private int TimeofDoc[] = new int[1]; // number of query terms found in each candidate

    /**
     * @param r reader over the index to score against; not closed by this class
     */
    public BM25OfMine(IndexReader r) {
        this.reader = r;
    }

    /**
     * Extracts the terms of {@code q} into the internal term array used by {@link #Score()}.
     *
     * <p>Only {@link BooleanQuery} (terms of every clause) and {@link TermQuery} are handled;
     * any other query type yields an empty term array. The term count and each term are printed
     * to standard output.
     *
     * @param q the query to take terms from
     */
    public void ConQuery(Query q) {
        // Start from a clean set so terms of an earlier query do not leak into this one.
        terms.clear();
        if (q instanceof BooleanQuery) {
            List<BooleanClause> clauses = ((BooleanQuery) q).clauses();
            for (int i = 0; i < clauses.size(); i++) {
                clauses.get(i).getQuery().extractTerms(terms);
            }
        } else if (q instanceof TermQuery) {
            q.extractTerms(terms);
        }
        t = terms.toArray(new Term[terms.size()]);
        System.out.println(t.length);
        for (int j = 0; j < t.length; j++) {
            System.out.println("查的词: " + t[j].toString());
        }
    }

    /**
     * @return the document count as reported by {@code IndexReader.maxDoc()}
     */
    public int GetDocNum() {
        return reader.maxDoc();
    }

    /**
     * Returns the number of documents containing {@code t}.
     *
     * @param t the term to look up
     * @return the document frequency, or 0 if the index could not be read (the error is printed)
     */
    public int GetTermDocNum(Term t) {
        int s = 0;
        try {
            s = reader.docFreq(t);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return s;
    }

    /**
     * Computes the BM25 inverse document frequency {@code ln((N - n + 0.5) / (n + 0.5) + 1)},
     * where N is the document total captured by {@link #search(Query)} and n is the document
     * frequency of {@code t}.
     *
     * @param t the term to weigh
     * @return the IDF weight; meaningful only after {@code search} has set the document total
     */
    public double GetIDF(Term t) {
        int nq = GetTermDocNum(t);
        return Math.log((NumOfDoc - nq + 0.5) / (nq + 0.5) + 1.0);
    }

    /**
     * Loads all postings (document number and within-document frequency) of {@code t} into the
     * internal buffers used by {@link #Score()}.
     *
     * <p>The number of entries is taken from the return value of {@code TermDocs.read}, and
     * reading continues until the enumeration is exhausted, so results are neither truncated
     * nor mixed with postings left over from a previous term. If reading fails the error is
     * printed and the term is treated as having no postings.
     *
     * @param t the term whose postings are loaded
     * @return always 0 (kept for compatibility with existing callers)
     */
    public double GetTermFreq(Term t) {
        postingCount = 0;
        TermDocs postings = null;
        try {
            postings = reader.termDocs(t);
            int[] docChunk = new int[READ_CHUNK];
            int[] freqChunk = new int[READ_CHUNK];
            int n;
            // read() returns 0 only once the enumeration is exhausted.
            while ((n = postings.read(docChunk, freqChunk)) > 0) {
                ensureCapacity(postingCount + n);
                System.arraycopy(docChunk, 0, Doc, postingCount, n);
                System.arraycopy(freqChunk, 0, frq, postingCount, n);
                postingCount += n;
            }
        } catch (IOException e) {
            e.printStackTrace();
            // Do not score against a partially read postings list.
            postingCount = 0;
        } finally {
            if (postings != null) {
                try {
                    postings.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return 0;
    }

    /** Grows the postings buffers so they can hold at least {@code needed} entries. */
    private void ensureCapacity(int needed) {
        if (needed <= Doc.length) {
            return;
        }
        int newSize = Math.max(needed, Doc.length * 2);
        int[] grownDocs = new int[newSize];
        int[] grownFreqs = new int[newSize];
        System.arraycopy(Doc, 0, grownDocs, 0, postingCount);
        System.arraycopy(frq, 0, grownFreqs, 0, postingCount);
        Doc = grownDocs;
        frq = grownFreqs;
    }

    /**
     * Computes the BM25 term-frequency component
     * {@code f * (k + 1) / (f + k * ((1 - b) + b * length / averageLength))} with k = 2.0 and
     * b = 0.75.
     *
     * @param f within-document frequency of the term
     * @param length length of the document
     * @return the saturated, length-normalised term frequency
     */
    public double GetTF(int f, double length) {
        return (f * (K1 + 1)) / (f + K1 * ((1 - B) + B * length / AverLength));
    }

    /**
     * Records a length for every document in {@code DocNo}.
     *
     * <p>Placeholder implementation: every document is assigned length 1.0.
     *
     * @param DocNo document numbers to record lengths for
     * @return always 0
     * @throws Exception declared for callers; not thrown by the placeholder implementation
     */
    public double GetLength(int DocNo[]) throws Exception {
        length = new float[DocNo.length];
        for (int i = 0; i < DocNo.length; i++) {
            length[i] = 1.0f;
        }
        return 0;
    }

    /**
     * Returns the mean of the lengths recorded by {@link #GetLength(int[])}.
     *
     * <p>With the current placeholder lengths this is 1.0. NOTE(review): BM25 normally uses the
     * average over the whole collection rather than over the candidate documents — confirm
     * before plugging in real lengths.
     *
     * @return the mean recorded length, or 1.0 when no lengths have been recorded
     */
    public double GetAverLength() {
        if (length == null || length.length == 0) {
            return 1.0;
        }
        double sum = 0.0;
        for (int i = 0; i < length.length; i++) {
            sum += length[i];
        }
        return sum / length.length;
    }

    /**
     * Scores documents term by term.
     *
     * <p>The postings of the first query term define the candidate documents; each further term
     * adds its contribution to the candidates it also occurs in and increments that candidate's
     * matched-term counter. With no query terms the result arrays are empty.
     *
     * @throws Exception if recording document lengths fails
     */
    public void Score() throws Exception {
        if (t == null || t.length == 0) {
            DocNo = new int[0];
            Score = new double[0];
            TimeofDoc = new int[0];
            length = new float[0];
            return;
        }

        // First term: its postings become the candidate set.
        GetTermFreq(t[0]);
        DocNo = new int[postingCount];
        Score = new double[postingCount];
        TimeofDoc = new int[postingCount];
        System.arraycopy(Doc, 0, DocNo, 0, postingCount);
        GetLength(DocNo);
        AverLength = GetAverLength();
        double idf = GetIDF(t[0]); // same for every document, so computed once per term
        for (int k = 0; k < DocNo.length; k++) {
            Score[k] = GetTF(frq[k], length[k]) * idf;
            TimeofDoc[k] = 1;
        }

        // Remaining terms: merge their postings against the candidates. Both sequences come
        // from TermDocs, whose pairs are ordered by document number, so a single forward pass
        // over each side finds every common document.
        for (int i = 1; i < t.length; i++) {
            GetTermFreq(t[i]);
            idf = GetIDF(t[i]);
            int k = 0;
            for (int j = 0; j < postingCount && k < DocNo.length; j++) {
                while (k < DocNo.length && DocNo[k] < Doc[j]) {
                    k++;
                }
                if (k < DocNo.length && DocNo[k] == Doc[j]) {
                    Score[k] = Score[k] + GetTF(frq[j], length[k]) * idf;
                    TimeofDoc[k]++;
                }
            }
        }
    }

    /**
     * Runs {@code q} and prints, for every document containing all query terms, its document
     * number, score and stored {@code path} field.
     *
     * @param q the query to evaluate
     * @throws Exception if scoring or loading a stored document fails
     */
    public void search(Query q) throws Exception {
        NumOfDoc = GetDocNum();
        ConQuery(q);
        Score();
        for (int ii = 0; ii < DocNo.length; ii++) {
            // Report only documents in which every query term occurred.
            if (TimeofDoc[ii] == t.length) {
                System.out.println("文档编号:" + DocNo[ii] + " 分数:" + Score[ii]
                        + " 路径:" + reader.document(DocNo[ii]).get("path"));
            }
        }
    }
}
今天的文章BM25_BM25是什么意思分享到此就结束了,感谢您的阅读。
版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 举报,一经查实,本站将立刻删除。
如需转载请保留出处:https://bianchenghao.cn/69987.html