如何增加nutch索引长度?
高亮显示比较简单,网上也有很多介绍代码。修改如下:
将 org.apache.nutch.searcher.Summary 第 54行 代码 修改为:
/**
 * Renders this element as HTML by wrapping the superclass's string form
 * in a red-coloured {@code <span>} so that it stands out in the summary.
 *
 * @return the superclass text enclosed in {@code <span style='color:red'>…</span>}
 */
@Override
public String toString() {
  final String plainText = super.toString();
  return "<span style='color:red'>" + plainText + "</span>";
}
增加索引长度(这里实际指搜索结果中摘要的长度)花了我比较长的时间,不过后来发现原来有两个参数是专门调整摘要长度的,刚看代码的时候没有注意到。在 org.apache.nutch.searcher.Summarizer 的 36 行左右有:
/**
 * How many context terms are shown before and after each match.
 * Read from the "searcher.summary.context" setting; 5 is passed as the fallback value.
 */
private static final int SUM_CONTEXT = NutchConf.get().getInt("searcher.summary.context", 5);

/**
 * Upper bound on the total number of terms shown in one summary.
 * Read from the "searcher.summary.length" setting; 100 is passed as the fallback value.
 */
private static final int SUM_LENGTH = NutchConf.get().getInt("searcher.summary.length", 100);
这两个参数都以 Term 为单位。第一个参数 SUM_CONTEXT 表示在摘要中每个命中的关键词前后各显示多少个上下文 Term,默认是 5 个(注:这里 NutchConf.get().getInt() 的第二个参数 5 表示默认值,也就是在取得的 searcher.summary.context 为 NULL 时给出的默认值),
第二个参数 SUM_LENGTH 表示在摘要中最多显示 100 个 Term。这个 Term 是分词得到的结果,在后面的摘要截取算法中需要用到 Term,不过可以通过 Lucene 保存的 Term 坐标(偏移量)来实现索引关键词的快速高亮显示,这样的好处是可以在查询的时候不再使用分词,以减少查询响应时间。
不过如果分词系统是基于词库的,则词库增长以后会有一定问题,这个以后再做专题讨论。
下面贴一下改过的算法内容,显示文字数大约在 150 个左右。如果需要增加到更多,则可以修改相应的代码(例如计算 endToken 时使用的 SUM_CONTEXT*20)。
data:image/s3,"s3://crabby-images/52d25/52d255aa9c3633c8b32ff962baf16e1c138906be" alt="如何增加nutch索引长度?"
/** Returns a summary for the given pre-tokenized text. */
data:image/s3,"s3://crabby-images/52d25/52d255aa9c3633c8b32ff962baf16e1c138906be" alt="如何增加nutch索引长度?"
publicSummarygetSummary(Stringtext,Queryquery)throwsIOException...{
data:image/s3,"s3://crabby-images/6a2d5/6a2d538db05f876b2501112c14eaa858e8cd9d0e" alt="如何增加nutch索引长度?"
//Simplisticimplementation.Findsthefirstfragmentsinthedocument
//containinganyqueryterms.
//
//TODO:checkthatphrasesinthequeryarematchedinthefragment
data:image/s3,"s3://crabby-images/6a2d5/6a2d538db05f876b2501112c14eaa858e8cd9d0e" alt="如何增加nutch索引长度?"
Token[]tokens=getTokens(text);//parsetexttotokenarray
data:image/s3,"s3://crabby-images/6a2d5/6a2d538db05f876b2501112c14eaa858e8cd9d0e" alt="如何增加nutch索引长度?"
if(tokens.length==0)
returnnewSummary();
data:image/s3,"s3://crabby-images/6a2d5/6a2d538db05f876b2501112c14eaa858e8cd9d0e" alt="如何增加nutch索引长度?"
String[]terms=query.getTerms();
HashSethighlight=newHashSet();//putquerytermsintable
for(inti=0;i<terms.length;i++)
highlight.add(terms[i]);
data:image/s3,"s3://crabby-images/6a2d5/6a2d538db05f876b2501112c14eaa858e8cd9d0e" alt="如何增加nutch索引长度?"
//
//CreateaSortedSetthatranksexcerptsaccordingto
//howmanyquerytermsarepresent.Anexcerptis
//aVectorfullofFragmentsandHighlights
//
data:image/s3,"s3://crabby-images/9afbe/9afbef3ef2924620adfa70bcc247f413cc7f1a3a" alt="如何增加nutch索引长度?"
SortedSetexcerptSet=newTreeSet(newComparator()...{
data:image/s3,"s3://crabby-images/9afbe/9afbef3ef2924620adfa70bcc247f413cc7f1a3a" alt="如何增加nutch索引长度?"
publicintcompare(Objecto1,Objecto2)...{
Excerptexcerpt1=(Excerpt)o1;
Excerptexcerpt2=(Excerpt)o2;
data:image/s3,"s3://crabby-images/6a2d5/6a2d538db05f876b2501112c14eaa858e8cd9d0e" alt="如何增加nutch索引长度?"
data:image/s3,"s3://crabby-images/9afbe/9afbef3ef2924620adfa70bcc247f413cc7f1a3a" alt="如何增加nutch索引长度?"
if(excerpt1==null&&excerpt2!=null)...{
return-1;
data:image/s3,"s3://crabby-images/9afbe/9afbef3ef2924620adfa70bcc247f413cc7f1a3a" alt="如何增加nutch索引长度?"
}elseif(excerpt1!=null&&excerpt2==null)...{
return1;
data:image/s3,"s3://crabby-images/9afbe/9afbef3ef2924620adfa70bcc247f413cc7f1a3a" alt="如何增加nutch索引长度?"
}elseif(excerpt1==null&&excerpt2==null)...{
return0;
}
data:image/s3,"s3://crabby-images/6a2d5/6a2d538db05f876b2501112c14eaa858e8cd9d0e" alt="如何增加nutch索引长度?"
intnumToks1=excerpt1.numUniqueTokens();
intnumToks2=excerpt2.numUniqueTokens();
data:image/s3,"s3://crabby-images/6a2d5/6a2d538db05f876b2501112c14eaa858e8cd9d0e" alt="如何增加nutch索引长度?"
data:image/s3,"s3://crabby-images/9afbe/9afbef3ef2924620adfa70bcc247f413cc7f1a3a" alt="如何增加nutch索引长度?"
if(numToks1<numToks2)...{
return-1;
data:image/s3,"s3://crabby-images/9afbe/9afbef3ef2924620adfa70bcc247f413cc7f1a3a" alt="如何增加nutch索引长度?"
}elseif(numToks1==numToks2)...{
returnexcerpt1.numFragments()-excerpt2.numFragments();
data:image/s3,"s3://crabby-images/9afbe/9afbef3ef2924620adfa70bcc247f413cc7f1a3a" alt="如何增加nutch索引长度?"
}else...{
return1;
}
}
}
);
data:image/s3,"s3://crabby-images/6a2d5/6a2d538db05f876b2501112c14eaa858e8cd9d0e" alt="如何增加nutch索引长度?"
//
//Iteratethroughalltermsinthedocument
//
intlastExcerptPos=0;
data:image/s3,"s3://crabby-images/9afbe/9afbef3ef2924620adfa70bcc247f413cc7f1a3a" alt="如何增加nutch索引长度?"
for(inti=0;i<tokens.length;i++)...{
//
//Ifwefindatermthat'sinthequery...
//
data:image/s3,"s3://crabby-images/9afbe/9afbef3ef2924620adfa70bcc247f413cc7f1a3a" alt="如何增加nutch索引长度?"
if(highlight.contains(tokens[i].termText()))...{
//
//StartsearchingatapointSUM_CONTEXTtermsback,
//andmoveSUM_CONTEXTtermsintothefuture.
//
intstartToken=(i>SUM_CONTEXT)?i-SUM_CONTEXT:0;
intendToken=Math.min(i+SUM_CONTEXT*20,tokens.length);
intoffset=tokens[startToken].startOffset();
intj=startToken;
data:image/s3,"s3://crabby-images/6a2d5/6a2d538db05f876b2501112c14eaa858e8cd9d0e" alt="如何增加nutch索引长度?"
//
//Iteratefromthestartpointtothefinish,adding
//termsalltheway.Theendofthepassageisalways
//SUM_CONTEXTbeyondthelastquery-term.
//
Excerptexcerpt=newExcerpt();
data:image/s3,"s3://crabby-images/9afbe/9afbef3ef2924620adfa70bcc247f413cc7f1a3a" alt="如何增加nutch索引长度?"
if(i!=0)...{
excerpt.add(newSummary.Ellipsis());
}
data:image/s3,"s3://crabby-images/6a2d5/6a2d538db05f876b2501112c14eaa858e8cd9d0e" alt="如何增加nutch索引长度?"
//
//Iteratethroughaslongaswe'rebeforetheendof
//thedocumentandwehaven'thitthemax-number-of-items
//-in-a-summary.
//
Tokena=null;
data:image/s3,"s3://crabby-images/9afbe/9afbef3ef2924620adfa70bcc247f413cc7f1a3a" alt="如何增加nutch索引长度?"
while((j<endToken)&&(j-startToken<SUM_LENGTH))...{
//
//Nowgrabthehit-element,ifpresent
//
Tokent=tokens[j];
data:image/s3,"s3://crabby-images/9afbe/9afbef3ef2924620adfa70bcc247f413cc7f1a3a" alt="如何增加nutch索引长度?"
if(highlight.contains(t.termText()))...{
excerpt.addToken(t.termText());
//System.out.println("Text:"+text.substring(offset,t.startOffset())+"OffSet:"+offset+"Start:"+t.startOffset());
excerpt.add(newFragment(text.substring(offset,t.startOffset())));
excerpt.add(newHighlight(text.substring(t.startOffset(),
t.endOffset())));
a=(Token)t.cloneToken();
offset=a.endOffset();
data:image/s3,"s3://crabby-images/6a2d5/6a2d538db05f876b2501112c14eaa858e8cd9d0e" alt="如何增加nutch索引长度?"
//endToken=Math.min(j+SUM_LENGTH,tokens.length);
}
j++;
}
data:image/s3,"s3://crabby-images/6a2d5/6a2d538db05f876b2501112c14eaa858e8cd9d0e" alt="如何增加nutch索引长度?"
data:image/s3,"s3://crabby-images/9afbe/9afbef3ef2924620adfa70bcc247f413cc7f1a3a" alt="如何增加nutch索引长度?"
...{
data:image/s3,"s3://crabby-images/9afbe/9afbef3ef2924620adfa70bcc247f413cc7f1a3a" alt="如何增加nutch索引长度?"
...{
if(offset<text.length()&&Math.min(endToken,
i+SUM_LENGTH)<tokens.length&&tokens[Math.min(endToken,
i+SUM_LENGTH)].endOffset()<text.length())
data:image/s3,"s3://crabby-images/9afbe/9afbef3ef2924620adfa70bcc247f413cc7f1a3a" alt="如何增加nutch索引长度?"
...{
excerpt.add(newFragment(text.substring(offset,
tokens[Math.min(endToken,
i+SUM_LENGTH)].endOffset())));
data:image/s3,"s3://crabby-images/6a2d5/6a2d538db05f876b2501112c14eaa858e8cd9d0e" alt="如何增加nutch索引长度?"
}
}
}
data:image/s3,"s3://crabby-images/6a2d5/6a2d538db05f876b2501112c14eaa858e8cd9d0e" alt="如何增加nutch索引长度?"
data:image/s3,"s3://crabby-images/6a2d5/6a2d538db05f876b2501112c14eaa858e8cd9d0e" alt="如何增加nutch索引长度?"
lastExcerptPos=endToken;
data:image/s3,"s3://crabby-images/6a2d5/6a2d538db05f876b2501112c14eaa858e8cd9d0e" alt="如何增加nutch索引长度?"
//
//Wefoundtheseriesofsearch-termhitsandadded
//them(withinterveningtext)totheexcerpt.Now
//weneedtoaddthetrailingedgeoftext.
//
//Soif(j<tokens.length)thenthereisstilltrailing
//texttoadd.(Wehaven'thittheendofthesourcedoc.)
//Addthewordssincethelasthit-terminsert.
//
//if(j<tokens.length){
//System.out.println(text.length()+"Ooffset:"+offset+"EndOff:"+tokens[j].endOffset()+""+text);
//excerpt.add(newFragment(text.substring(offset,offset+tokens[j].endOffset())));
//}
data:image/s3,"s3://crabby-images/6a2d5/6a2d538db05f876b2501112c14eaa858e8cd9d0e" alt="如何增加nutch索引长度?"
//
//Rememberhowmanytermsareinthisexcerpt
//
excerpt.setNumTerms(j-startToken);
data:image/s3,"s3://crabby-images/6a2d5/6a2d538db05f876b2501112c14eaa858e8cd9d0e" alt="如何增加nutch索引长度?"
//
//Storetheexcerptforlatersorting
//
excerptSet.add(excerpt);
data:image/s3,"s3://crabby-images/6a2d5/6a2d538db05f876b2501112c14eaa858e8cd9d0e" alt="如何增加nutch索引长度?"
//
//StartSUM_CONTEXTplacesaway.Thenext
//searchforrelevantexcerptsbeginsati-SUM_CONTEXT
//
i=j+SUM_CONTEXT;
}
}
data:image/s3,"s3://crabby-images/6a2d5/6a2d538db05f876b2501112c14eaa858e8cd9d0e" alt="如何增加nutch索引长度?"
//
//Ifthetargettextdoesn'tappear,thenwejust
//excerptthefirstSUM_LENGTHwordsfromthedocument.
//
data:image/s3,"s3://crabby-images/9afbe/9afbef3ef2924620adfa70bcc247f413cc7f1a3a" alt="如何增加nutch索引长度?"
if(excerptSet.size()==0)...{
Excerptexcerpt=newExcerpt();
intexcerptLen=Math.min(SUM_LENGTH,tokens.length);
lastExcerptPos=excerptLen;
data:image/s3,"s3://crabby-images/6a2d5/6a2d538db05f876b2501112c14eaa858e8cd9d0e" alt="如何增加nutch索引长度?"
excerpt.add(newFragment(text.substring(tokens[0].startOffset(),tokens[excerptLen-1].startOffset())));
excerpt.setNumTerms(excerptLen);
excerptSet.add(excerpt);
}
data:image/s3,"s3://crabby-images/6a2d5/6a2d538db05f876b2501112c14eaa858e8cd9d0e" alt="如何增加nutch索引长度?"
本文地址:
http://www.45fan.com/dnjc/72872.html