@Override
public FieldFragList createFieldFragList(FieldPhraseList fieldPhraseList, int fragCharSize) {
  // Single-fragment strategy: collect every phrase into one fragment that
  // spans the whole field (offsets 0..Integer.MAX_VALUE).
  FieldFragList ffl = new SimpleFieldFragList(fragCharSize);
  List<WeightedPhraseInfo> allPhrases = new ArrayList<>();
  for (Iterator<WeightedPhraseInfo> it = fieldPhraseList.phraseList.iterator(); it.hasNext(); ) {
    WeightedPhraseInfo info = it.next();
    if (info == null) {
      break; // defensive: stop at the first null entry, matching the previous behavior
    }
    allPhrases.add(info);
  }
  if (!allPhrases.isEmpty()) {
    ffl.add(0, Integer.MAX_VALUE, allPhrases);
  }
  return ffl;
}
public void testWeightedPhraseInfoComparisonConsistency() {
  // Five probe instances: a and d are built identically; e shares offsets
  // with a/d but carries a higher boost; b and c have strictly later offsets.
  final WeightedPhraseInfo a = newInfo(0, 0, 1);
  final WeightedPhraseInfo b = newInfo(1, 2, 1);
  final WeightedPhraseInfo c = newInfo(2, 3, 1);
  final WeightedPhraseInfo d = newInfo(0, 0, 1);
  final WeightedPhraseInfo e = newInfo(0, 0, 2);

  // Reflexivity: every instance must be consistently equal to itself.
  for (WeightedPhraseInfo info : new WeightedPhraseInfo[] {a, b, c, d, e}) {
    assertConsistentEquals(info, info);
  }
  // Structurally identical instances must compare equal.
  assertConsistentEquals(a, d);
  // Ordering checks: offsets dominate, boost breaks ties on equal offsets.
  assertConsistentLessThan(a, b);
  assertConsistentLessThan(b, c);
  assertConsistentLessThan(a, c);
  assertConsistentLessThan(a, e);
  assertConsistentLessThan(e, b);
  assertConsistentLessThan(e, c);
  assertConsistentLessThan(d, b);
  assertConsistentLessThan(d, c);
  assertConsistentLessThan(d, e);
}
@Override
public FieldFragList createFieldFragList(FieldPhraseList fieldPhraseList, int fragCharSize) {
  // Gathers the entire phrase list into a single fragment covering the whole field.
  FieldFragList fragList = new SimpleFieldFragList(fragCharSize);
  List<WeightedPhraseInfo> collected = new ArrayList<WeightedPhraseInfo>();
  Iterator<WeightedPhraseInfo> iterator = fieldPhraseList.phraseList.iterator();
  while (iterator.hasNext()) {
    WeightedPhraseInfo info = iterator.next();
    if (info == null) {
      break; // stop on a null entry, as the previous implementation did
    }
    collected.add(info);
  }
  if (collected.size() > 0) {
    fragList.add(0, Integer.MAX_VALUE, collected);
  }
  return fragList;
}
@Override
public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ) {
  // Convert each phrase into a SubInfo carrying its own boost; the fragment's
  // total weight is simply the sum of the phrase boosts.
  List<SubInfo> subInfos = new ArrayList<>();
  float fragmentBoost = 0;
  for (WeightedPhraseInfo info : phraseInfoList) {
    SubInfo sub = new SubInfo(info.getText(), info.getTermsOffsets(), info.getSeqnum(), info.getBoost());
    subInfos.add(sub);
    fragmentBoost += info.getBoost();
  }
  getFragInfos().add(new WeightedFragInfo(startOffset, endOffset, subInfos, fragmentBoost));
}
@Override public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ) { List<SubInfo> tempSubInfos = new ArrayList<>(); List<SubInfo> realSubInfos = new ArrayList<>(); HashSet<String> distinctTerms = new HashSet<>(); int length = 0; for( WeightedPhraseInfo phraseInfo : phraseInfoList ){ float phraseTotalBoost = 0; for ( TermInfo ti : phraseInfo.getTermsInfos()) { if ( distinctTerms.add( ti.getText() ) ) phraseTotalBoost += ti.getWeight() * phraseInfo.getBoost(); length++; } tempSubInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(), phraseInfo.getSeqnum(), phraseTotalBoost ) ); } // We want that terms per fragment (length) is included into the weight. Otherwise a one-word-query // would cause an equal weight for all fragments regardless of how much words they contain. // To avoid that fragments containing a high number of words possibly "outrank" more relevant fragments // we "bend" the length with a standard-normalization a little bit. float norm = length * ( 1 / (float)Math.sqrt( length ) ); float totalBoost = 0; for ( SubInfo tempSubInfo : tempSubInfos ) { float subInfoBoost = tempSubInfo.getBoost() * norm; realSubInfos.add( new SubInfo( tempSubInfo.getText(), tempSubInfo.getTermsOffsets(), tempSubInfo.getSeqnum(), subInfoBoost )); totalBoost += subInfoBoost; } getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, realSubInfos, totalBoost ) ); }
@Override
public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ) {
  // Fragment weight is the plain sum of the phrase boosts; the SubInfos in
  // this variant carry no per-phrase boost of their own.
  List<SubInfo> subInfos = new ArrayList<SubInfo>();
  float boostSum = 0;
  for (WeightedPhraseInfo phrase : phraseInfoList) {
    subInfos.add(new SubInfo(phrase.getText(), phrase.getTermsOffsets(), phrase.getSeqnum()));
    boostSum += phrase.getBoost();
  }
  getFragInfos().add(new WeightedFragInfo(startOffset, endOffset, subInfos, boostSum));
}
@Override
public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ) {
  // Builds a WeightedFragInfo whose weight sums the weights of the distinct
  // query terms matched in this fragment, then scales by sqrt(length), where
  // length is the total number of matched term occurrences.
  float totalBoost = 0;
  List<SubInfo> subInfos = new ArrayList<SubInfo>();
  HashSet<String> distinctTerms = new HashSet<String>();
  int length = 0; // total matched term occurrences across all phrases
  for( WeightedPhraseInfo phraseInfo : phraseInfoList ){
    subInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(), phraseInfo.getSeqnum() ) );
    for ( TermInfo ti : phraseInfo.getTermsInfos()) {
      // Each distinct term contributes its weight only once per fragment.
      if ( distinctTerms.add( ti.getText() ) )
        totalBoost += ti.getWeight() * phraseInfo.getBoost();
      length++;
    }
  }
  // We want that terms per fragment (length) is included into the weight. Otherwise a one-word-query
  // would cause an equal weight for all fragments regardless of how much words they contain.
  // To avoid that fragments containing a high number of words possibly "outrank" more relevant fragments
  // we "bend" the length with a standard-normalization a little bit.
  // NOTE(review): the right-hand side is computed in double and implicitly
  // narrowed to float by the compound assignment; with an empty phraseInfoList
  // length is 0 and the factor is 0*Infinity = NaN — presumably callers never
  // pass an empty list here; verify against the call sites.
  totalBoost *= length * ( 1 / Math.sqrt( length ) );
  getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, totalBoost ) );
}
@Override
public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ) {
  // Build one boost-carrying SubInfo per phrase and accumulate the fragment
  // weight as the plain sum of the phrase boosts.
  float sum = 0;
  List<SubInfo> parts = new ArrayList<SubInfo>();
  for (int i = 0; i < phraseInfoList.size(); i++) {
    WeightedPhraseInfo phrase = phraseInfoList.get(i);
    parts.add(new SubInfo(phrase.getText(), phrase.getTermsOffsets(), phrase.getSeqnum(), phrase.getBoost()));
    sum += phrase.getBoost();
  }
  getFragInfos().add(new WeightedFragInfo(startOffset, endOffset, parts, sum));
}
@Override
public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ) {
  // Builds a WeightedFragInfo whose weight reflects both the distinct query
  // terms matched in this fragment and, via a sqrt normalization, the number
  // of matched term occurrences.
  List<SubInfo> tempSubInfos = new ArrayList<SubInfo>();
  List<SubInfo> realSubInfos = new ArrayList<SubInfo>();
  HashSet<String> distinctTerms = new HashSet<String>();
  int length = 0; // total matched term occurrences across all phrases
  for( WeightedPhraseInfo phraseInfo : phraseInfoList ){
    float phraseTotalBoost = 0;
    for ( TermInfo ti : phraseInfo.getTermsInfos()) {
      // Each distinct term contributes its weight only once per fragment.
      if ( distinctTerms.add( ti.getText() ) )
        phraseTotalBoost += ti.getWeight() * phraseInfo.getBoost();
      length++;
    }
    tempSubInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(), phraseInfo.getSeqnum(), phraseTotalBoost ) );
  }
  // We want that terms per fragment (length) is included into the weight. Otherwise a one-word-query
  // would cause an equal weight for all fragments regardless of how much words they contain.
  // To avoid that fragments containing a high number of words possibly "outrank" more relevant fragments
  // we "bend" the length with a standard-normalization a little bit.
  float norm = length * ( 1 / (float)Math.sqrt( length ) );
  float totalBoost = 0;
  for ( SubInfo tempSubInfo : tempSubInfos ) {
    // Scale each phrase boost by the fragment-level normalization factor.
    float subInfoBoost = tempSubInfo.getBoost() * norm;
    realSubInfos.add( new SubInfo( tempSubInfo.getText(), tempSubInfo.getTermsOffsets(), tempSubInfo.getSeqnum(), subInfoBoost ));
    totalBoost += subInfoBoost;
  }
  getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, realSubInfos, totalBoost ) );
}
protected FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, FieldFragList fieldFragList, int fragCharSize ){
  if (fragCharSize < minFragCharSize) {
    throw new IllegalArgumentException( "fragCharSize(" + fragCharSize + ") is too small. It must be " + minFragCharSize + " or higher." );
  }
  List<WeightedPhraseInfo> fragPhrases = new ArrayList<>();
  IteratorQueue<WeightedPhraseInfo> queue = new IteratorQueue<>(fieldPhraseList.getPhraseList().iterator());
  WeightedPhraseInfo phraseInfo = null;
  int startOffset = 0;
  while ((phraseInfo = queue.top()) != null) {
    // A phrase that starts inside the previous fragment is discarded entirely.
    if (phraseInfo.getStartOffset() < startOffset) {
      queue.removeTop();
      continue;
    }
    fragPhrases.clear();
    final int currentPhraseStartOffset = phraseInfo.getStartOffset();
    int currentPhraseEndOffset = phraseInfo.getEndOffset();
    int spanStart = Math.max(currentPhraseStartOffset - margin, startOffset);
    int spanEnd = Math.max(currentPhraseEndOffset, spanStart + fragCharSize);
    if (acceptPhrase(queue.removeTop(), currentPhraseEndOffset - currentPhraseStartOffset, fragCharSize)) {
      fragPhrases.add(phraseInfo);
    }
    // Pull in every following phrase that still ends before spanEnd.
    while ((phraseInfo = queue.top()) != null) {
      if (phraseInfo.getEndOffset() > spanEnd) {
        break;
      }
      currentPhraseEndOffset = phraseInfo.getEndOffset();
      if (acceptPhrase(queue.removeTop(), currentPhraseEndOffset - currentPhraseStartOffset, fragCharSize)) {
        fragPhrases.add(phraseInfo);
      }
    }
    if (fragPhrases.isEmpty()) {
      continue; // nothing accepted for this span; try the next phrase
    }
    final int matchLen = currentPhraseEndOffset - currentPhraseStartOffset;
    // Re-center the fragment around the matched span; matchLen may exceed
    // fragCharSize, so clamp the margin at zero to keep spanStart sane.
    final int newMargin = Math.max(0, (fragCharSize - matchLen) / 2);
    spanStart = currentPhraseStartOffset - newMargin;
    if (spanStart < startOffset) {
      spanStart = startOffset;
    }
    // Grow the fragment to whichever is larger: the match or the target size.
    spanEnd = spanStart + Math.max(matchLen, fragCharSize);
    startOffset = spanEnd;
    fieldFragList.add(spanStart, spanEnd, fragPhrases);
  }
  return fieldFragList;
}
private WeightedPhraseInfo newInfo( int startOffset, int endOffset, float boost ) {
  // Test helper: wrap a single TermInfo with random text (position 0, weight 0)
  // in a WeightedPhraseInfo carrying the given offsets and boost.
  LinkedList<TermInfo> terms = new LinkedList<>();
  terms.add(new TermInfo(TestUtil.randomUnicodeString(random()), startOffset, endOffset, 0, 0));
  return new WeightedPhraseInfo(terms, boost);
}
protected FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, FieldFragList fieldFragList, int fragCharSize ){
  // Splits the phrase list into fragments of at least fragCharSize characters
  // and appends them to fieldFragList. The 'taken' flag marks a phrase that
  // was pulled from the iterator by the inner loop but did not fit the current
  // fragment, so the outer loop must process it instead of advancing.
  if( fragCharSize < minFragCharSize )
    throw new IllegalArgumentException( "fragCharSize(" + fragCharSize + ") is too small. It must be " + minFragCharSize + " or higher." );
  List<WeightedPhraseInfo> wpil = new ArrayList<WeightedPhraseInfo>();
  Iterator<WeightedPhraseInfo> ite = fieldPhraseList.getPhraseList().iterator();
  WeightedPhraseInfo phraseInfo = null;
  int startOffset = 0;
  boolean taken = false; // true if phraseInfo already holds an unconsumed element
  while( true ){
    if( !taken ){
      if( !ite.hasNext() ) break;
      phraseInfo = ite.next();
    }
    taken = false;
    if( phraseInfo == null ) break;
    // if the phrase violates the border of previous fragment, discard it and try next phrase
    if( phraseInfo.getStartOffset() < startOffset ) continue;
    wpil.clear();
    wpil.add( phraseInfo );
    int firstOffset = phraseInfo.getStartOffset();
    // Open the fragment 'margin' chars before the phrase, but never before the
    // end of the previous fragment.
    int st = phraseInfo.getStartOffset() - margin < startOffset ?
      startOffset : phraseInfo.getStartOffset() - margin;
    int en = st + fragCharSize;
    if( phraseInfo.getEndOffset() > en )
      en = phraseInfo.getEndOffset();
    int lastEndOffset = phraseInfo.getEndOffset();
    // Pull in every following phrase that still ends inside the fragment.
    while( true ){
      if( ite.hasNext() ){
        phraseInfo = ite.next();
        taken = true;
        if( phraseInfo == null ) break;
      }
      else
        break;
      if( phraseInfo.getEndOffset() <= en ){
        wpil.add( phraseInfo );
        lastEndOffset = phraseInfo.getEndOffset();
      }
      else
        break;
    }
    int matchLen = lastEndOffset - firstOffset;
    //now recalculate the start and end position to "center" the result
    // NOTE(review): newMargin can go negative when matchLen > fragCharSize;
    // the newer IteratorQueue-based implementation clamps it at zero.
    int newMargin = (fragCharSize-matchLen)/2;
    st = firstOffset - newMargin;
    if(st<startOffset){
      st = startOffset;
    }
    en = st+fragCharSize;
    startOffset = en;
    fieldFragList.add( st, en, wpil );
  }
  return fieldFragList;
}
protected FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, FieldFragList fieldFragList, int fragCharSize ){
  // Splits the phrase list into fragments of at least fragCharSize characters,
  // centering each fragment around its accepted phrases, and appends them to
  // fieldFragList. Fragments never overlap: startOffset tracks the end of the
  // previously emitted fragment.
  if( fragCharSize < minFragCharSize )
    throw new IllegalArgumentException( "fragCharSize(" + fragCharSize + ") is too small. It must be " + minFragCharSize + " or higher." );
  List<WeightedPhraseInfo> wpil = new ArrayList<WeightedPhraseInfo>();
  IteratorQueue<WeightedPhraseInfo> queue = new IteratorQueue<WeightedPhraseInfo>(fieldPhraseList.getPhraseList().iterator());
  WeightedPhraseInfo phraseInfo = null;
  int startOffset = 0;
  while((phraseInfo = queue.top()) != null){
    // if the phrase violates the border of previous fragment, discard it and try next phrase
    if( phraseInfo.getStartOffset() < startOffset ) {
      queue.removeTop();
      continue;
    }
    wpil.clear();
    final int currentPhraseStartOffset = phraseInfo.getStartOffset();
    int currentPhraseEndOffset = phraseInfo.getEndOffset();
    int spanStart = Math.max(currentPhraseStartOffset - margin, startOffset);
    int spanEnd = Math.max(currentPhraseEndOffset, spanStart + fragCharSize);
    // acceptPhrase may veto the phrase (e.g. multi-term match wider than the fragment).
    if (acceptPhrase(queue.removeTop(), currentPhraseEndOffset - currentPhraseStartOffset, fragCharSize)) {
      wpil.add(phraseInfo);
    }
    while((phraseInfo = queue.top()) != null) { // pull until we crossed the current spanEnd
      if (phraseInfo.getEndOffset() <= spanEnd) {
        currentPhraseEndOffset = phraseInfo.getEndOffset();
        if (acceptPhrase(queue.removeTop(), currentPhraseEndOffset - currentPhraseStartOffset, fragCharSize)) {
          wpil.add(phraseInfo);
        }
      } else {
        break;
      }
    }
    if (wpil.isEmpty()) {
      continue; // nothing accepted for this span; try the next phrase
    }
    final int matchLen = currentPhraseEndOffset - currentPhraseStartOffset;
    // now recalculate the start and end position to "center" the result
    final int newMargin = Math.max(0, (fragCharSize-matchLen)/2); // matchLen can be > fragCharSize prevent IAOOB here
    spanStart = currentPhraseStartOffset - newMargin;
    if (spanStart < startOffset) {
      spanStart = startOffset;
    }
    // whatever is bigger here we grow this out
    spanEnd = spanStart + Math.max(matchLen, fragCharSize);
    startOffset = spanEnd;
    fieldFragList.add(spanStart, spanEnd, wpil);
  }
  return fieldFragList;
}
private WeightedPhraseInfo newInfo( int startOffset, int endOffset, float boost ) {
  // Builds a one-term phrase with random text; only the offsets and the boost
  // matter for the comparison tests that use this helper.
  LinkedList<TermInfo> termInfos = new LinkedList<TermInfo>();
  TermInfo term = new TermInfo(_TestUtil.randomUnicodeString(random()), startOffset, endOffset, 0, 0);
  termInfos.add(term);
  return new WeightedPhraseInfo(termInfos, boost);
}
/**
 * A predicate to decide if the given {@link WeightedPhraseInfo} should be
 * accepted as a highlighted phrase or if it should be discarded.
 * <p>
 * The default implementation accepts single-term phrases unconditionally and
 * multi-term phrases only when their match length fits within the configured
 * fragment character size.
 *
 * @param info the phrase info to accept
 * @param matchLength the match length of the current phrase
 * @param fragCharSize the configured fragment character size
 * @return <code>true</code> if this phrase info should be accepted as a highlight phrase
 */
protected boolean acceptPhrase(WeightedPhraseInfo info, int matchLength, int fragCharSize) {
  if (info.getTermsOffsets().size() <= 1) {
    return true; // single-term matches are always accepted
  }
  return matchLength <= fragCharSize;
}
/**
 * Converts the list of {@link WeightedPhraseInfo} into a WeightedFragInfo and
 * adds it to the fragInfos. Implementations decide how the phrase boosts are
 * combined into the fragment's overall weight.
 *
 * @param startOffset start offset of the fragment
 * @param endOffset end offset of the fragment
 * @param phraseInfoList list of WeightedPhraseInfo objects
 */
public abstract void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList );