
Lucene 10 Source Code Analysis

Posted on: June 27, 2022 at 10:50 PM

Background

The Lucene checkout on my home machine is version 10, so everything below refers to the 10.0.0-SNAPSHOT source.

Creating and saving the index

### Breakpoint
java -agentlib:jdwp=transport=dt_socket,server=y,address=8000 -cp /home/dai/lucene/lucene/demo/build/libs/lucene-demo-10.0.0-SNAPSHOT.jar:/home/dai/lucene/lucene/core/build/libs/lucene-core-10.0.0-SNAPSHOT.jar  org.apache.lucene.demo.IndexFiles  -docs  /home/dai/docs
### Debugging with jdb
jdb -attach 8000 -sourcepath /home/dai/lucene/lucene/demo/src/java/:/home/dai/lucene/lucene/core/src/java/

Tokenization

Both the inverted index and tokenization are driven from this code path:

main[1] where
  [1] org.apache.lucene.index.IndexingChain$PerField.invert (IndexingChain.java:1,140)
  [2] org.apache.lucene.index.IndexingChain.processField (IndexingChain.java:729)
  [3] org.apache.lucene.index.IndexingChain.processDocument (IndexingChain.java:620)
  [4] org.apache.lucene.index.DocumentsWriterPerThread.updateDocuments (DocumentsWriterPerThread.java:241)
  [5] org.apache.lucene.index.DocumentsWriter.updateDocuments (DocumentsWriter.java:432)
  [6] org.apache.lucene.index.IndexWriter.updateDocuments (IndexWriter.java:1,531)
  [7] org.apache.lucene.index.IndexWriter.updateDocument (IndexWriter.java:1,816)
  [8] org.apache.lucene.index.IndexWriter.addDocument (IndexWriter.java:1,469)
  [9] org.apache.lucene.demo.IndexFiles.indexDoc (IndexFiles.java:271)
  [10] org.apache.lucene.demo.IndexFiles$1.visitFile (IndexFiles.java:212)
  [11] org.apache.lucene.demo.IndexFiles$1.visitFile (IndexFiles.java:208)
  [12] java.nio.file.Files.walkFileTree (Files.java:2,811)
  [13] java.nio.file.Files.walkFileTree (Files.java:2,882)
  [14] org.apache.lucene.demo.IndexFiles.indexDocs (IndexFiles.java:206)
  [15] org.apache.lucene.demo.IndexFiles.main (IndexFiles.java:157)
Step completed: "thread=main", org.apache.lucene.index.TermsHashPerField.add(), line=193 bci=22
193        int termID = bytesHash.add(termBytes);

main[1] print termBytes
 termBytes = "[2f 68 6f 6d 65 2f 64 61 69 2f 64 6f 63 73 2f 62 62 62 2e 74 78 74]"
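
The hex bytes are just the UTF-8 encoding of /home/dai/docs/bbb.txt, i.e. the path field's value being added as a term. bytesHash is a BytesRefHash; add returns a fresh termID for an unseen term and -(termID + 1) for one already present, which is how TermsHashPerField decides between its newTerm and addTerm paths. A minimal sketch of that return-value convention:

  import org.apache.lucene.util.BytesRef;
  import org.apache.lucene.util.BytesRefHash;

  public class BytesRefHashDemo {
    public static void main(String[] args) {
      BytesRefHash hash = new BytesRefHash();
      int first = hash.add(new BytesRef("am"));  // >= 0: first occurrence, a new termID
      int again = hash.add(new BytesRef("am"));  // < 0: term is already in the hash
      if (again < 0) {
        int termID = (-again) - 1;               // recover the original termID
        System.out.println(first == termID);     // true
      }
    }
  }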

invert

The inverted index is, at its core, a term => doc mapping. The central class is lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java, and its addTerm method is where each term occurrence is recorded:

  @Override
  void addTerm(final int termID, final int docID) {
    final FreqProxPostingsArray postings = freqProxPostingsArray;
    assert !hasFreq || postings.termFreqs[termID] > 0;

    if (!hasFreq) {
      assert postings.termFreqs == null;
      if (termFreqAtt.getTermFrequency() != 1) {
        throw new IllegalStateException(
            "field \""
                + getFieldName()
                + "\": must index term freq while using custom TermFrequencyAttribute");
      }
      if (docID != postings.lastDocIDs[termID]) {
        // New document; now encode docCode for previous doc:
        assert docID > postings.lastDocIDs[termID];
        writeVInt(0, postings.lastDocCodes[termID]);
        postings.lastDocCodes[termID] = docID - postings.lastDocIDs[termID];
        postings.lastDocIDs[termID] = docID;
        fieldState.uniqueTermCount++;
      }
    } else if (docID != postings.lastDocIDs[termID]) {
      assert docID > postings.lastDocIDs[termID]
          : "id: " + docID + " postings ID: " + postings.lastDocIDs[termID] + " termID: " + termID;
      // Term not yet seen in the current doc but previously
      // seen in other doc(s) since the last flush

      // Now that we know doc freq for previous doc,
      // write it & lastDocCode
      if (1 == postings.termFreqs[termID]) {
        writeVInt(0, postings.lastDocCodes[termID] | 1);
      } else {
        writeVInt(0, postings.lastDocCodes[termID]);
        writeVInt(0, postings.termFreqs[termID]);
      }

      // Init freq for the current document
      postings.termFreqs[termID] = getTermFreq();
      fieldState.maxTermFrequency =
          Math.max(postings.termFreqs[termID], fieldState.maxTermFrequency);
      postings.lastDocCodes[termID] = (docID - postings.lastDocIDs[termID]) << 1;
      postings.lastDocIDs[termID] = docID;
      if (hasProx) {
        writeProx(termID, fieldState.position);
        if (hasOffsets) {
          postings.lastOffsets[termID] = 0;
          writeOffsets(termID, fieldState.offset);
        }
      } else {
        assert !hasOffsets;
      }
      fieldState.uniqueTermCount++;
    } else {
      postings.termFreqs[termID] = Math.addExact(postings.termFreqs[termID], getTermFreq());
      fieldState.maxTermFrequency =
          Math.max(fieldState.maxTermFrequency, postings.termFreqs[termID]);
      if (hasProx) {
        writeProx(termID, fieldState.position - postings.lastPositions[termID]);
        if (hasOffsets) {
          writeOffsets(termID, fieldState.offset);
        }
      }
    }
  }
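
The interesting detail is lastDocCodes: the doc delta is stored shifted left one bit, and the low bit flags freq == 1 so the frequency can be omitted from the stream. A standalone sketch of that encode/decode convention (my own toy code, not the actual writer):

  public class DocDeltaCodec {
    // Encode one (delta, freq) pair the way addTerm stages it:
    // low bit set means freq == 1 and no freq value follows.
    static int[] encode(int delta, int freq) {
      return freq == 1 ? new int[] {(delta << 1) | 1} : new int[] {delta << 1, freq};
    }

    static void decode(int[] stream) {
      int i = 0, doc = 0;
      while (i < stream.length) {
        int code = stream[i++];
        doc += code >>> 1;                                // deltas accumulate into doc IDs
        int freq = (code & 1) != 0 ? 1 : stream[i++];
        System.out.println("doc=" + doc + " freq=" + freq);
      }
    }

    public static void main(String[] args) {
      // term seen in doc 3 with freq 2, then doc 7 with freq 1
      int[] a = encode(3, 2), b = encode(4, 1);           // second delta is 7 - 3 = 4
      int[] stream = {a[0], a[1], b[0]};
      decode(stream);                                     // doc=3 freq=2, then doc=7 freq=1
    }
  }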

Generating the termID

Stack

main[1] where
  [1] org.apache.lucene.index.TermsHashPerField.initStreamSlices (TermsHashPerField.java:150)
  [2] org.apache.lucene.index.TermsHashPerField.add (TermsHashPerField.java:198)
  [3] org.apache.lucene.index.IndexingChain$PerField.invert (IndexingChain.java:1,224)
  [4] org.apache.lucene.index.IndexingChain.processField (IndexingChain.java:729)
  [5] org.apache.lucene.index.IndexingChain.processDocument (IndexingChain.java:620)
  [6] org.apache.lucene.index.DocumentsWriterPerThread.updateDocuments (DocumentsWriterPerThread.java:241)
  [7] org.apache.lucene.index.DocumentsWriter.updateDocuments (DocumentsWriter.java:432)
  [8] org.apache.lucene.index.IndexWriter.updateDocuments (IndexWriter.java:1,531)
  [9] org.apache.lucene.index.IndexWriter.updateDocument (IndexWriter.java:1,816)
  [10] org.apache.lucene.index.IndexWriter.addDocument (IndexWriter.java:1,469)
  [11] org.apache.lucene.demo.IndexFiles.indexDoc (IndexFiles.java:271)
  [12] org.apache.lucene.demo.IndexFiles$1.visitFile (IndexFiles.java:212)
  [13] org.apache.lucene.demo.IndexFiles$1.visitFile (IndexFiles.java:208)
  [14] java.nio.file.Files.walkFileTree (Files.java:2,811)
  [15] java.nio.file.Files.walkFileTree (Files.java:2,882)
  [16] org.apache.lucene.demo.IndexFiles.indexDocs (IndexFiles.java:206)
  [17] org.apache.lucene.demo.IndexFiles.main (IndexFiles.java:157)
TermsHashPerField's constructor receives the three pools that back term storage:

      IntBlockPool intPool,
      ByteBlockPool bytePool,
      ByteBlockPool termBytePool,

First, the intPool field. It maintains a two-dimensional array int[][] buffers plus three cursors; the ints it stores are offsets into bytePool.

public final class IntBlockPool {
...

  // Core structure: a 2-D array whose ints are offsets into bytePool.
  // The outer array starts at capacity 10 and grows automatically.
  public int[][] buffers = new int[10][];

  // Index of the current (head) buffer, i.e. buffer = buffers[bufferUpto]
  private int bufferUpto = -1;
  // The head buffer itself, the most recently allocated slice of buffers
  public int[] buffer;
  // Offset relative to the head buffer; starts "full" so the first write
  // forces a new buffer to be allocated
  public int intUpto = INT_BLOCK_SIZE;
  // Absolute offset across the whole pool, like absolute vs. relative
  // addressing in machine code
  public int intOffset = -INT_BLOCK_SIZE;
}

Like intPool, the bytePool and termBytePool fields are each described by a two-dimensional array plus a few cursors:

public final class ByteBlockPool implements Accountable {
...
  // Core structure: a two-dimensional byte array
  public byte[][] buffers = new byte[10][];

  /** index into the buffers array pointing to the current buffer used as the head */
  private int bufferUpto = -1; // Which buffer we are upto
  /** Where we are in head buffer */
  public int byteUpto = BYTE_BLOCK_SIZE;

  /** Current head buffer */
  public byte[] buffer;
  /** Current head offset */
  public int byteOffset = -BYTE_BLOCK_SIZE;
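
Both pools turn a single global offset into (buffer, slot) with shift/mask arithmetic. A sketch, assuming the usual Lucene constants (BYTE_BLOCK_SHIFT = 15 so BYTE_BLOCK_SIZE = 32768; IntBlockPool uses 13/8192):

  public class BlockPoolAddressing {
    static final int BYTE_BLOCK_SHIFT = 15;
    static final int BYTE_BLOCK_SIZE = 1 << BYTE_BLOCK_SHIFT; // 32768
    static final int BYTE_BLOCK_MASK = BYTE_BLOCK_SIZE - 1;

    public static void main(String[] args) {
      int globalOffset = 70_000;                          // a position somewhere in the pool
      int bufferIndex = globalOffset >> BYTE_BLOCK_SHIFT; // which inner array
      int slot = globalOffset & BYTE_BLOCK_MASK;          // index within it
      // the byte lives at buffers[bufferIndex][slot]
      System.out.println(bufferIndex + ", " + slot);      // prints: 2, 4464
    }
  }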

Query and search

Breakpoint

## Breakpoint
java -agentlib:jdwp=transport=dt_socket,server=y,address=8000 -cp /home/dai/lucene/lucene/demo/build/libs/lucene-demo-10.0.0-SNAPSHOT.jar:/home/dai/lucene/lucene/core/build/libs/lucene-core-10.0.0-SNAPSHOT.jar:/home/dai/lucene/lucene/queryparser/build/libs/lucene-queryparser-10.0.0-SNAPSHOT.jar  org.apache.lucene.demo.SearchFiles

## Debugging with jdb
jdb -attach 8000 -sourcepath /home/dai/lucene/lucene/demo/src/java/:/home/dai/lucene/lucene/core/src/java/

termState holds the term's statistics. Here docFreq=1 and totalTermFreq=1 say the term contents:am occurs exactly once, in a single document:

main[1] print termState
 termState = "TermStates
  state=docFreq=1 totalTermFreq=1 termBlockOrd=2 blockFP=0 docStartFP=63 posStartFP=63 payStartFP=0 lastPosBlockOffset=-1 singletonDocID=6
"
main[1] print term
 term = "contents:am"
main[1] where
  [1] org.apache.lucene.search.TermQuery.createWeight (TermQuery.java:233)
  [2] org.apache.lucene.search.IndexSearcher.createWeight (IndexSearcher.java:894)
  [3] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:686)
  [4] org.apache.lucene.search.IndexSearcher.searchAfter (IndexSearcher.java:532)
  [5] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:542)
  [6] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:180)
  [7] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)

Scoring

The default similarity is BM25Similarity:

main[1] where
  [1] org.apache.lucene.search.similarities.BM25Similarity.scorer (BM25Similarity.java:200)
  [2] org.apache.lucene.search.TermQuery$TermWeight.<init> (TermQuery.java:75)
  [3] org.apache.lucene.search.TermQuery.createWeight (TermQuery.java:233)
  [4] org.apache.lucene.search.IndexSearcher.createWeight (IndexSearcher.java:894)
  [5] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:686)
  [6] org.apache.lucene.search.IndexSearcher.searchAfter (IndexSearcher.java:532)
  [7] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:542)
  [8] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:180)
  [9] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)

The core search loop

For each leaf (segment) context, the searcher asks the Weight for a BulkScorer and lets it push matching docs into the collector:

main[1] list
763            // there is no doc of interest in this reader context
764            // continue with the following leaf
765            continue;
766          }
767 =>       BulkScorer scorer = weight.bulkScorer(ctx);
768          if (scorer != null) {
769            try {
770              scorer.score(leafCollector, ctx.reader().getLiveDocs());
771            } catch (
772                @SuppressWarnings("unused")
main[1] where
  [1] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:767)
  [2] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:693)
  [3] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:687)
  [4] org.apache.lucene.search.IndexSearcher.searchAfter (IndexSearcher.java:532)
  [5] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:542)
  [6] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:180)
  [7] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)

Getting the reader

Step completed: "thread=main", org.apache.lucene.index.LeafReaderContext.reader(), line=67 bci=0
67        return reader;

main[1] print reader
 reader = "_0(10.0.0):c7:[diagnostics={source=flush, os.arch=amd64, java.runtime.version=17.0.3+7-Ubuntu-0ubuntu0.22.04.1, os.version=5.15.0-33-generic, java.vendor=Private Build, os=Linux, timestamp=1656601918836, java.version=17.0.3, java.vm.version=17.0.3+7-Ubuntu-0ubuntu0.22.04.1, lucene.version=10.0.0}]:[attributes={Lucene90StoredFieldsFormat.mode=BEST_SPEED}] :id=c276i3vlaza4c6uumuxapfnvf"
main[1] where
  [1] org.apache.lucene.index.LeafReaderContext.reader (LeafReaderContext.java:67)
  [2] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:770)
  [3] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:693)
  [4] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:687)
  [5] org.apache.lucene.search.IndexSearcher.searchAfter (IndexSearcher.java:532)
  [6] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:542)
  [7] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:180)
  [8] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)

The reader object is a SegmentReader over segment _0 (the "c7" in its name and numDocs below both say it holds 7 documents):

main[1] dump reader
 reader = {
    si: instance of org.apache.lucene.index.SegmentCommitInfo(id=1531)
    originalSi: instance of org.apache.lucene.index.SegmentCommitInfo(id=1532)
    metaData: instance of org.apache.lucene.index.LeafMetaData(id=1533)
    liveDocs: null
    hardLiveDocs: null
    numDocs: 7
    core: instance of org.apache.lucene.index.SegmentCoreReaders(id=1534)
    segDocValues: instance of org.apache.lucene.index.SegmentDocValues(id=1535)
    isNRT: false
    docValuesProducer: null
    fieldInfos: instance of org.apache.lucene.index.FieldInfos(id=1536)
    readerClosedListeners: instance of java.util.concurrent.CopyOnWriteArraySet(id=1537)
    readerCacheHelper: instance of org.apache.lucene.index.SegmentReader$1(id=1538)
    coreCacheHelper: instance of org.apache.lucene.index.SegmentReader$2(id=1539)
    $assertionsDisabled: true
    org.apache.lucene.index.LeafReader.readerContext: instance of org.apache.lucene.index.LeafReaderContext(id=1540)
    org.apache.lucene.index.LeafReader.$assertionsDisabled: true
    org.apache.lucene.index.IndexReader.closed: false
    org.apache.lucene.index.IndexReader.closedByChild: false
    org.apache.lucene.index.IndexReader.refCount: instance of java.util.concurrent.atomic.AtomicInteger(id=1541)
    org.apache.lucene.index.IndexReader.parentReaders: instance of java.util.Collections$SynchronizedSet(id=1542)
}

Scoring:

main[1] list
222    
223        @Override
224        public int score(LeafCollector collector, Bits acceptDocs, int min, int max)
225            throws IOException {
226 =>       collector.setScorer(scorer);
227          DocIdSetIterator scorerIterator = twoPhase == null ? iterator : twoPhase.approximation();
228          DocIdSetIterator competitiveIterator = collector.competitiveIterator();
229          DocIdSetIterator filteredIterator;
230          if (competitiveIterator == null) {
231            filteredIterator = scorerIterator;
main[1] where
  [1] org.apache.lucene.search.Weight$DefaultBulkScorer.score (Weight.java:226)
  [2] org.apache.lucene.search.BulkScorer.score (BulkScorer.java:38)
  [3] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:770)
  [4] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:693)
  [5] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:687)
  [6] org.apache.lucene.search.IndexSearcher.searchAfter (IndexSearcher.java:532)
  [7] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:542)
  [8] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:180)
  [9] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)

Collecting hits

  private static class SimpleTopScoreDocCollector extends TopScoreDocCollector {

    ...

    @Override
    public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {   
      ...
      return new ScorerLeafCollector() {
          ...
        @Override
        public void collect(int doc) throws IOException {
          float score = scorer.score();   // <-- no docID is passed here: the scorer's iterator is already positioned on the doc being collected

          // This collector relies on the fact that scorers produce positive values:
          assert score >= 0; // NOTE: false for NaN

          totalHits++;
          hitsThresholdChecker.incrementHitCount();

          if (minScoreAcc != null && (totalHits & minScoreAcc.modInterval) == 0) {
            updateGlobalMinCompetitiveScore(scorer);
          }

          if (score <= pqTop.score) {
            if (totalHitsRelation == TotalHits.Relation.EQUAL_TO) {
              // we just reached totalHitsThreshold, we can start setting the min
              // competitive score now
              updateMinCompetitiveScore(scorer);
            }
            // Since docs are returned in-order (i.e., increasing doc Id), a document
            // with equal score to pqTop.score cannot compete since HitQueue favors
            // documents with lower doc Ids. Therefore reject those docs too.
            return;
          }
          pqTop.doc = doc + docBase;
          pqTop.score = score;
          pqTop = pq.updateTop();
          updateMinCompetitiveScore(scorer);
        }
      };
    }
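
The pqTop dance above is the classic bounded min-heap: the queue is pre-filled with k sentinel entries and a hit only displaces the current minimum. A self-contained sketch of the same pattern using java.util.PriorityQueue (the names are mine, not Lucene's):

  import java.util.PriorityQueue;

  public class TopKDemo {
    record Hit(int doc, float score) {}

    public static void main(String[] args) {
      int k = 3;
      // Min-heap by score: the weakest of the current top-k sits on top.
      PriorityQueue<Hit> pq =
          new PriorityQueue<>(k, (a, b) -> Float.compare(a.score(), b.score()));
      for (int i = 0; i < k; i++) pq.add(new Hit(-1, Float.NEGATIVE_INFINITY)); // sentinels

      float[] scores = {0.3f, 2.1f, 0.9f, 1.7f, 0.2f, 3.0f};
      for (int doc = 0; doc < scores.length; doc++) {
        if (scores[doc] <= pq.peek().score()) continue; // cannot compete: reject early
        pq.poll();                                      // drop the current minimum
        pq.add(new Hit(doc, scores[doc]));
      }
      // Drains ascending; populateResults fills its array backwards for the same reason.
      while (!pq.isEmpty()) System.out.println(pq.poll());
    }
  }
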
main[1] print scorer
 scorer = "scorer(weight(contents:am))[org.apache.lucene.search.TermScorer@290dbf45]"
main[1] dump scorer
 scorer = {
    postingsEnum: instance of org.apache.lucene.index.SlowImpactsEnum(id=1546)
    impactsEnum: instance of org.apache.lucene.index.SlowImpactsEnum(id=1546)
    iterator: instance of org.apache.lucene.search.ImpactsDISI(id=1547)
    docScorer: instance of org.apache.lucene.search.LeafSimScorer(id=1548)
    impactsDisi: instance of org.apache.lucene.search.ImpactsDISI(id=1547)
    $assertionsDisabled: true
    org.apache.lucene.search.Scorer.weight: instance of org.apache.lucene.search.TermQuery$TermWeight(id=1549)
}
main[1] where
  [1] org.apache.lucene.search.TopScoreDocCollector$SimpleTopScoreDocCollector$1.collect (TopScoreDocCollector.java:76)     <-- collect(doc) receives the docID as a callback argument; pq is a priority queue keeping the current best-scored docs ordered
  [2] org.apache.lucene.search.Weight$DefaultBulkScorer.scoreAll (Weight.java:305)   
  [3] org.apache.lucene.search.Weight$DefaultBulkScorer.score (Weight.java:247)
  [4] org.apache.lucene.search.BulkScorer.score (BulkScorer.java:38)
  [5] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:770)
  [6] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:693)
  [7] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:687)
  [8] org.apache.lucene.search.IndexSearcher.searchAfter (IndexSearcher.java:532)
  [9] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:542)
  [10] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:180)
  [11] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)
main[1] 

The core scoring function

BM25 scoring, stopped inside BM25Scorer.score:

main[1] list
246          // float. And then monotonicity is preserved through composition via
247          // x -> 1 + x and x -> 1 - 1/x.
248          // Finally we expand weight * (1 - 1 / (1 + freq * 1/norm)) to
249          // weight - weight / (1 + freq * 1/norm), which runs slightly faster.
250 =>       float normInverse = cache[((byte) encodedNorm) & 0xFF];
251          return weight - weight / (1f + freq * normInverse);
252        }
253    
254        @Override
255        public Explanation explain(Explanation freq, long encodedNorm) {
main[1] where
  [1] org.apache.lucene.search.similarities.BM25Similarity$BM25Scorer.score (BM25Similarity.java:250)
  [2] org.apache.lucene.search.LeafSimScorer.score (LeafSimScorer.java:60)
  [3] org.apache.lucene.search.TermScorer.score (TermScorer.java:75)
  [4] org.apache.lucene.search.TopScoreDocCollector$SimpleTopScoreDocCollector$1.collect (TopScoreDocCollector.java:73)
  [5] org.apache.lucene.search.Weight$DefaultBulkScorer.scoreAll (Weight.java:305)
  [6] org.apache.lucene.search.Weight$DefaultBulkScorer.score (Weight.java:247)
  [7] org.apache.lucene.search.BulkScorer.score (BulkScorer.java:38)
  [8] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:770)
  [9] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:693)
  [10] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:687)
  [11] org.apache.lucene.search.IndexSearcher.searchAfter (IndexSearcher.java:532)
  [12] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:542)
  [13] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:180)
  [14] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)
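
Expanding the code above gives the familiar BM25 shape (a sketch of the algebra; w is the precomputed weight, f the within-document term frequency, dl the field length, avgdl its average, and k1, b the BM25 parameters):

  \mathrm{norm} = k_1\left(1 - b + b \cdot \frac{dl}{avgdl}\right), \qquad
  \mathrm{score} = w - \frac{w}{1 + f/\mathrm{norm}} = w \cdot \frac{f}{f + \mathrm{norm}}

The cached normInverse is 1/norm, and w folds together the query boost and the IDF term, so this is standard BM25 up to the constant (k1 + 1) numerator factor, which newer Lucene versions omit as far as I know.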

The reduce step

populateResults drains the hit queue smallest-score-first, filling the results array back to front so it ends up sorted by descending score:

main[1] list
60       * Populates the results array with the ScoreDoc instances. This can be overridden in case a
61       * different ScoreDoc type should be returned.
62       */
63      protected void populateResults(ScoreDoc[] results, int howMany) {
64 =>     for (int i = howMany - 1; i >= 0; i--) {
65          results[i] = pq.pop();
66        }
67      }
68    
69      /**
main[1] where
  [1] org.apache.lucene.search.TopDocsCollector.populateResults (TopDocsCollector.java:64)
  [2] org.apache.lucene.search.TopDocsCollector.topDocs (TopDocsCollector.java:166)
  [3] org.apache.lucene.search.TopDocsCollector.topDocs (TopDocsCollector.java:98)
  [4] org.apache.lucene.search.IndexSearcher$2.reduce (IndexSearcher.java:526)
  [5] org.apache.lucene.search.IndexSearcher$2.reduce (IndexSearcher.java:505)
  [6] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:694)
  [7] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:687)
  [8] org.apache.lucene.search.IndexSearcher.searchAfter (IndexSearcher.java:532)
  [9] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:542)
  [10] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:180)
  [11] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)

Helper: merging the top-k hits across shards

Stack:

main[1] where
  [1] org.apache.lucene.search.TopDocs.mergeAux (TopDocs.java:312)
  [2] org.apache.lucene.search.TopDocs.merge (TopDocs.java:216)
  [3] org.apache.lucene.search.IndexSearcher$2.reduce (IndexSearcher.java:528)
  [4] org.apache.lucene.search.IndexSearcher$2.reduce (IndexSearcher.java:505)
  [5] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:694)
  [6] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:687)
  [7] org.apache.lucene.search.IndexSearcher.searchAfter (IndexSearcher.java:532)
  [8] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:542)
  [9] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:180)
  [10] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)
  /**
   * Auxiliary method used by the {@link #merge} impls. A sort value of null is used to indicate
   * that docs should be sorted by score.
   */
  private static TopDocs mergeAux(
      Sort sort, int start, int size, TopDocs[] shardHits, Comparator<ScoreDoc> tieBreaker) {

    final PriorityQueue<ShardRef> queue;
    if (sort == null) {
      queue = new ScoreMergeSortQueue(shardHits, tieBreaker);
    } else {
      queue = new MergeSortQueue(sort, shardHits, tieBreaker);
    }

    long totalHitCount = 0;
    TotalHits.Relation totalHitsRelation = TotalHits.Relation.EQUAL_TO;
    int availHitCount = 0;
    for (int shardIDX = 0; shardIDX < shardHits.length; shardIDX++) {
      final TopDocs shard = shardHits[shardIDX];
      // totalHits can be non-zero even if no hits were
      // collected, when searchAfter was used:
      totalHitCount += shard.totalHits.value;
      // If any hit count is a lower bound then the merged
      // total hit count is a lower bound as well
      if (shard.totalHits.relation == TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO) {
        totalHitsRelation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO;
      }
      if (shard.scoreDocs != null && shard.scoreDocs.length > 0) {
        availHitCount += shard.scoreDocs.length;
        queue.add(new ShardRef(shardIDX));
      }
    }

    final ScoreDoc[] hits;
    boolean unsetShardIndex = false;
    if (availHitCount <= start) {
      hits = new ScoreDoc[0];
    } else {
      hits = new ScoreDoc[Math.min(size, availHitCount - start)];
      int requestedResultWindow = start + size;
      int numIterOnHits = Math.min(availHitCount, requestedResultWindow);
      int hitUpto = 0;
      while (hitUpto < numIterOnHits) {
        assert queue.size() > 0;
        ShardRef ref = queue.top();
        final ScoreDoc hit = shardHits[ref.shardIndex].scoreDocs[ref.hitIndex++];

        // Irrespective of whether we use shard indices for tie breaking or not, we check for
        // consistent
        // order in shard indices to defend against potential bugs
        if (hitUpto > 0) {
          if (unsetShardIndex != (hit.shardIndex == -1)) {
            throw new IllegalArgumentException("Inconsistent order of shard indices");
          }
        }

        unsetShardIndex |= hit.shardIndex == -1;

        if (hitUpto >= start) {
          hits[hitUpto - start] = hit;
        }

        hitUpto++;

        if (ref.hitIndex < shardHits[ref.shardIndex].scoreDocs.length) {
          // Not done with this these TopDocs yet:
          queue.updateTop();
        } else {
          queue.pop();
        }
      }
    }

    TotalHits totalHits = new TotalHits(totalHitCount, totalHitsRelation);
    if (sort == null) {
      return new TopDocs(totalHits, hits);
    } else {
      return new TopFieldDocs(totalHits, hits, sort.getSort());
    }
  }
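
mergeAux is reached through TopDocs.merge. A hedged usage sketch, building two fake per-shard results by hand (docs and scores are made up; note shardIndex must be set consistently on every hit, or left -1 everywhere, exactly as the guard above enforces):

  import org.apache.lucene.search.ScoreDoc;
  import org.apache.lucene.search.TopDocs;
  import org.apache.lucene.search.TotalHits;

  public class MergeDemo {
    public static void main(String[] args) {
      TopDocs shard0 = new TopDocs(
          new TotalHits(2, TotalHits.Relation.EQUAL_TO),
          new ScoreDoc[] {new ScoreDoc(3, 2.0f, 0), new ScoreDoc(9, 0.5f, 0)});
      TopDocs shard1 = new TopDocs(
          new TotalHits(1, TotalHits.Relation.EQUAL_TO),
          new ScoreDoc[] {new ScoreDoc(1, 1.5f, 1)});

      // Merge preserves global score order across shards.
      TopDocs merged = TopDocs.merge(3, new TopDocs[] {shard0, shard1});
      for (ScoreDoc sd : merged.scoreDocs) {
        System.out.println("doc=" + sd.doc + " shard=" + sd.shardIndex + " score=" + sd.score);
      }
    }
  }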

Fetching the stored document by docID

The stored-fields reader seeks to the document's chunk and decompresses it:

        fieldsStream.seek(startPointer);
        decompressor.decompress(fieldsStream, totalLength, offset, length, bytes);
        assert bytes.length == length;
        documentInput = new ByteArrayDataInput(bytes.bytes, bytes.offset, bytes.length);

Stack:

main[1] where
  [1] org.apache.lucene.store.ByteBufferIndexInput$SingleBufferImpl.seek (ByteBufferIndexInput.java:576)
  [2] org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsReader$BlockState.document (Lucene90CompressingStoredFieldsReader.java:594)
  [3] org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsReader.document (Lucene90CompressingStoredFieldsReader.java:610)
  [4] org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsReader.visitDocument (Lucene90CompressingStoredFieldsReader.java:628)
  [5] org.apache.lucene.index.CodecReader.document (CodecReader.java:89)
  [6] org.apache.lucene.index.BaseCompositeReader.document (BaseCompositeReader.java:154)
  [7] org.apache.lucene.index.IndexReader.document (IndexReader.java:380)
  [8] org.apache.lucene.search.IndexSearcher.doc (IndexSearcher.java:380)
  [9] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:214)
  [10] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)

Loading files into memory via mmap:

Breakpoint hit: "thread=main", org.apache.lucene.store.ByteBufferIndexInput.setCurBuf(), line=86 bci=0
86        this.curBuf = curBuf;

main[1] where
  [1] org.apache.lucene.store.ByteBufferIndexInput.setCurBuf (ByteBufferIndexInput.java:86)
  [2] org.apache.lucene.store.ByteBufferIndexInput$SingleBufferImpl.<init> (ByteBufferIndexInput.java:556)
  [3] org.apache.lucene.store.ByteBufferIndexInput.newInstance (ByteBufferIndexInput.java:63)
  [4] org.apache.lucene.store.MMapDirectory.openInput (MMapDirectory.java:238)
  [5] org.apache.lucene.store.Directory.openChecksumInput (Directory.java:152)
  [6] org.apache.lucene.index.SegmentInfos.readCommit (SegmentInfos.java:290)
  [7] org.apache.lucene.index.StandardDirectoryReader$1.doBody (StandardDirectoryReader.java:88)
  [8] org.apache.lucene.index.StandardDirectoryReader$1.doBody (StandardDirectoryReader.java:77)
  [9] org.apache.lucene.index.SegmentInfos$FindSegmentsFile.run (SegmentInfos.java:798)
  [10] org.apache.lucene.index.StandardDirectoryReader.open (StandardDirectoryReader.java:109)
  [11] org.apache.lucene.index.StandardDirectoryReader.open (StandardDirectoryReader.java:67)
  [12] org.apache.lucene.index.DirectoryReader.open (DirectoryReader.java:60)
  [13] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:105)

Clearly, the file opening happens in org.apache.lucene.store.MMapDirectory.openInput, which memory-maps the file it opens.

The first file opened is segments_1, the commit point that lists all segments in the index:

main[1] print name
 name = "segments_1"
main[1] list
228    
229      /** Creates an IndexInput for the file with the given name. */
230      @Override
231      public IndexInput openInput(String name, IOContext context) throws IOException {
232 =>     ensureOpen();
233        ensureCanRead(name);
234        Path path = directory.resolve(name);
235        try (FileChannel c = FileChannel.open(path, StandardOpenOption.READ)) {
236          final String resourceDescription = "MMapIndexInput(path=\"" + path.toString() + "\")";
237          final boolean useUnmap = getUseUnmap();
main[1] 

Example: reading a string field:

  private static void readField(DataInput in, StoredFieldVisitor visitor, FieldInfo info, int bits)
      throws IOException {
    switch (bits & TYPE_MASK) {
      case BYTE_ARR:
        int length = in.readVInt();
        byte[] data = new byte[length];
        in.readBytes(data, 0, length);
        visitor.binaryField(info, data);
        break;
      case STRING:
        visitor.stringField(info, in.readString());
        break;
      case NUMERIC_INT:
        visitor.intField(info, in.readZInt());
        break;
      case NUMERIC_FLOAT:
        visitor.floatField(info, readZFloat(in));
        break;
      case NUMERIC_LONG:
        visitor.longField(info, readTLong(in));
        break;
      case NUMERIC_DOUBLE:
        visitor.doubleField(info, readZDouble(in));
        break;
      default:
        throw new AssertionError("Unknown type flag: " + Integer.toHexString(bits));
    }
  }
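
in.readVInt() above reads Lucene's variable-length integer encoding: 7 payload bits per byte, with the high bit set when another byte follows. A standalone decoder sketch over a plain byte array:

  public class VIntDemo {
    // Decode one vInt starting at pos[0]; advances pos[0] past it.
    static int readVInt(byte[] buf, int[] pos) {
      int value = 0, shift = 0;
      byte b;
      do {
        b = buf[pos[0]++];
        value |= (b & 0x7F) << shift; // 7 payload bits per byte
        shift += 7;
      } while ((b & 0x80) != 0);      // high bit means continuation
      return value;
    }

    public static void main(String[] args) {
      // 300 = 0b1_0010_1100 encodes as 0xAC 0x02 (low 7-bit group first)
      byte[] buf = {(byte) 0xAC, 0x02};
      System.out.println(readVInt(buf, new int[] {0})); // prints 300
    }
  }
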
main[1] where
  [1] org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsReader.readField (Lucene90CompressingStoredFieldsReader.java:246)
  [2] org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsReader.visitDocument (Lucene90CompressingStoredFieldsReader.java:640)
  [3] org.apache.lucene.index.CodecReader.document (CodecReader.java:89)
  [4] org.apache.lucene.index.BaseCompositeReader.document (BaseCompositeReader.java:154)
  [5] org.apache.lucene.index.IndexReader.document (IndexReader.java:380)
  [6] org.apache.lucene.search.IndexSearcher.doc (IndexSearcher.java:380)
  [7] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:214)
  [8] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)
main[1] 
main[1] list
66      }
67    
68      @Override
69      public void stringField(FieldInfo fieldInfo, String value) throws IOException {
70 =>     final FieldType ft = new FieldType(TextField.TYPE_STORED);
71        ft.setStoreTermVectors(fieldInfo.hasVectors());
72        ft.setOmitNorms(fieldInfo.omitsNorms());
73        ft.setIndexOptions(fieldInfo.getIndexOptions());
74        doc.add(
75            new StoredField(
main[1] print value
 value = "/home/dai/docs/aaa.txt"
main[1] where
  [1] org.apache.lucene.document.DocumentStoredFieldVisitor.stringField (DocumentStoredFieldVisitor.java:70)
  [2] org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsReader.readField (Lucene90CompressingStoredFieldsReader.java:246)
  [3] org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsReader.visitDocument (Lucene90CompressingStoredFieldsReader.java:640)
  [4] org.apache.lucene.index.CodecReader.document (CodecReader.java:89)
  [5] org.apache.lucene.index.BaseCompositeReader.document (BaseCompositeReader.java:154)
  [6] org.apache.lucene.index.IndexReader.document (IndexReader.java:380)
  [7] org.apache.lucene.search.IndexSearcher.doc (IndexSearcher.java:380)
  [8] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:214)
  [9] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)

The string that was read is added into the Document object.

This is the core routine: seek the stored-fields stream (an mmapped file) to the chunk's start pointer, decompress, and wrap the bytes for decoding. The offsets array gives each document's slice within the decompressed chunk, so offsets[index + 1] - offsets[index] is this doc's length:


    /**
     * Get the serialized representation of the given docID. This docID has to be contained in the
     * current block.
     */
    SerializedDocument document(int docID) throws IOException {
      if (contains(docID) == false) {
        throw new IllegalArgumentException();
      }

      final int index = docID - docBase;
      final int offset = Math.toIntExact(offsets[index]);
      final int length = Math.toIntExact(offsets[index + 1]) - offset;
      final int totalLength = Math.toIntExact(offsets[chunkDocs]);
      final int numStoredFields = Math.toIntExact(this.numStoredFields[index]);

      final BytesRef bytes;
      if (merging) {
        bytes = this.bytes;
      } else {
        bytes = new BytesRef();
      }
      ... 
        fieldsStream.seek(startPointer);  // seek to the chunk's start pointer
        decompressor.decompress(fieldsStream, totalLength, offset, length, bytes);   // decompress this doc's slice
        assert bytes.length == length;
        documentInput = new ByteArrayDataInput(bytes.bytes, bytes.offset, bytes.length);  // wrap the bytes for field decoding
      }

      return new SerializedDocument(documentInput, length, numStoredFields);
    }
  }

Advancing to the doc

advance(target) consults skip data only when the posting list spans more than one block (docFreq > BLOCK_SIZE) and the target lies beyond the next skip entry:

Breakpoint hit: "thread=main", org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockDocsEnum.advance(), line=498 bci=0
498          if (docFreq > BLOCK_SIZE && target > nextSkipDoc) {

main[1] where
  [1] org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockDocsEnum.advance (Lucene90PostingsReader.java:498)
  [2] org.apache.lucene.index.SlowImpactsEnum.advance (SlowImpactsEnum.java:77)
  [3] org.apache.lucene.search.ImpactsDISI.advance (ImpactsDISI.java:128)
  [4] org.apache.lucene.search.ImpactsDISI.nextDoc (ImpactsDISI.java:133)
  [5] org.apache.lucene.search.Weight$DefaultBulkScorer.scoreAll (Weight.java:301)
  [6] org.apache.lucene.search.Weight$DefaultBulkScorer.score (Weight.java:247)
  [7] org.apache.lucene.search.BulkScorer.score (BulkScorer.java:38)
  [8] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:770)
  [9] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:693)
  [10] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:687)
  [11] org.apache.lucene.search.IndexSearcher.searchAfter (IndexSearcher.java:532)
  [12] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:542)
  [13] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:180)
  [14] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)

Term query and iteration

Note that ImpactsEnum is itself a doc-ID iterator (it extends PostingsEnum, which extends DocIdSetIterator):

1,138      }
1,139    
1,140      @Override
1,141      public ImpactsEnum impacts(int flags) throws IOException {
1,142 =>     assert !eof;
1,143        // if (DEBUG) {
1,144        // System.out.println("BTTR.docs seg=" + segment);
1,145        // }
1,146        currentFrame.decodeMetaData();
1,147        // if (DEBUG) {
main[1] where
  [1] org.apache.lucene.codecs.lucene90.blocktree.SegmentTermsEnum.impacts (SegmentTermsEnum.java:1,142)
  [2] org.apache.lucene.search.TermQuery$TermWeight.scorer (TermQuery.java:114)
  [3] org.apache.lucene.search.Weight.bulkScorer (Weight.java:166)
  [4] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:767)
  [5] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:693)
  [6] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:687)
  [7] org.apache.lucene.search.IndexSearcher.searchAfter (IndexSearcher.java:532)
  [8] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:542)
  [9] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:180)
  [10] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)

Note that PostingsEnum is likewise a DocIdSetIterator.
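
So any postings enum can be consumed with the standard iterator idiom. A sketch, assuming reader is an open LeafReader over our index and contents is the indexed field (both are assumptions mirroring the demo setup):

  import java.io.IOException;
  import org.apache.lucene.index.LeafReader;
  import org.apache.lucene.index.PostingsEnum;
  import org.apache.lucene.index.Terms;
  import org.apache.lucene.index.TermsEnum;
  import org.apache.lucene.search.DocIdSetIterator;
  import org.apache.lucene.util.BytesRef;

  public class PostingsDemo {
    // Print every (doc, freq) pair for one term.
    static void walkPostings(LeafReader reader) throws IOException {
      Terms terms = reader.terms("contents");
      if (terms == null) return;                      // field not indexed
      TermsEnum te = terms.iterator();
      if (te.seekExact(new BytesRef("am"))) {
        PostingsEnum pe = te.postings(null, PostingsEnum.FREQS);
        for (int doc = pe.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = pe.nextDoc()) {
          System.out.println("doc=" + doc + " freq=" + pe.freq());
        }
      }
    }
  }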

Top-k merge ordering


main[1] where
  [1] org.apache.lucene.util.PriorityQueue.upHeap (PriorityQueue.java:276)
  [2] org.apache.lucene.util.PriorityQueue.add (PriorityQueue.java:161)
  [3] org.apache.lucene.search.TopDocs.mergeAux (TopDocs.java:303)
  [4] org.apache.lucene.search.TopDocs.merge (TopDocs.java:216)
  [5] org.apache.lucene.search.IndexSearcher$2.reduce (IndexSearcher.java:528)
  [6] org.apache.lucene.search.IndexSearcher$2.reduce (IndexSearcher.java:505)
  [7] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:694)
  [8] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:687)
  [9] org.apache.lucene.search.IndexSearcher.searchAfter (IndexSearcher.java:532)
  [10] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:542)
  [11] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:180)
  [12] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)


ScoreMergeSortQueue.lessThan returns true when first scores higher, so the top of the queue is always the globally best remaining hit:

    @Override
    public boolean lessThan(ShardRef first, ShardRef second) {
      assert first != second;
      ScoreDoc firstScoreDoc = shardHits[first.shardIndex][first.hitIndex];
      ScoreDoc secondScoreDoc = shardHits[second.shardIndex][second.hitIndex];
      if (firstScoreDoc.score < secondScoreDoc.score) {
        return false;
      } else if (firstScoreDoc.score > secondScoreDoc.score) {
        return true;
      } else {
        return tieBreakLessThan(first, firstScoreDoc, second, secondScoreDoc, tieBreakerComparator);
      }
    }
Related reading