James Moger
2012-03-14 7c7e942e29be432c081ff71362a756769e6bbe17
src/com/gitblit/utils/LuceneUtils.java
@@ -15,6 +15,8 @@
 */
package com.gitblit.utils;
import static org.eclipse.jgit.treewalk.filter.TreeFilter.ANY_DIFF;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
@@ -23,14 +25,18 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.DateTools.Resolution;
@@ -51,23 +57,28 @@
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.eclipse.jgit.diff.DiffEntry.ChangeType;
import org.eclipse.jgit.lib.Constants;
import org.eclipse.jgit.lib.FileMode;
import org.eclipse.jgit.lib.ObjectId;
import org.eclipse.jgit.lib.ObjectLoader;
import org.eclipse.jgit.lib.ObjectReader;
import org.eclipse.jgit.lib.Repository;
import org.eclipse.jgit.revwalk.RevCommit;
import org.eclipse.jgit.revwalk.RevObject;
import org.eclipse.jgit.revwalk.RevTree;
import org.eclipse.jgit.revwalk.RevWalk;
import org.eclipse.jgit.storage.file.FileBasedConfig;
import org.eclipse.jgit.treewalk.EmptyTreeIterator;
import org.eclipse.jgit.treewalk.TreeWalk;
import org.eclipse.jgit.treewalk.filter.AndTreeFilter;
import org.eclipse.jgit.treewalk.filter.PathFilterGroup;
import org.eclipse.jgit.treewalk.filter.TreeFilter;
import org.eclipse.jgit.util.FS;
import com.gitblit.models.IssueModel;
@@ -105,7 +116,9 @@
   private static final int INDEX_VERSION = 1;
   private static final String FIELD_OBJECT_TYPE = "type";
   private static final String FIELD_OBJECT_ID = "id";
   private static final String FIELD_ISSUE = "issue";
   private static final String FIELD_PATH = "path";
   private static final String FIELD_COMMIT = "commit";
   private static final String FIELD_BRANCH = "branch";
   private static final String FIELD_REPOSITORY = "repository";
   private static final String FIELD_SUMMARY = "summary";
@@ -241,15 +254,9 @@
    * 
    * @param repositoryName
    * @param repository
    * @param fullIndex
    *            If false, blob metadata is set to the HEAD revision of each
    *            branch.  If true, the last commit of each blob is determined
    *            to properly index the author, committer, and date.
    *            Full indexing can be time-consuming.
    * @return IndexResult
    */
   public static IndexResult reindex(String repositoryName, Repository repository,
         boolean fullIndex) {
   public static IndexResult reindex(String repositoryName, Repository repository) {
      IndexResult result = new IndexResult();
      if (!LuceneUtils.deleteIndex(repository)) {
         return result;
@@ -270,101 +277,140 @@
            }
            tags.get(tag.getReferencedObjectId().getName()).add(tag.displayName);
         }
         ObjectReader reader = repository.newObjectReader();
         // walk through each branch
         // get the local branches
         List<RefModel> branches = JGitUtils.getLocalBranches(repository, true, -1);
         // sort them by most recently updated
         Collections.sort(branches, new Comparator<RefModel>() {
            @Override
            public int compare(RefModel ref1, RefModel ref2) {
               return ref2.getDate().compareTo(ref1.getDate());
            }
         });
         // reorder default branch to first position
         RefModel defaultBranch = null;
         ObjectId defaultBranchId = JGitUtils.getDefaultBranch(repository);
         for (RefModel branch : branches) {
            if (branch.getObjectId().equals(defaultBranchId)) {
               defaultBranch = branch;
               break;
            }
         }
         branches.remove(defaultBranch);
         branches.add(0, defaultBranch);
         // walk through each branch
         for (RefModel branch : branches) {
            if (excludedBranches.contains(branch.getName())) {
               continue;
            }
            String branchName = branch.getName();
            RevWalk revWalk = new RevWalk(repository);
            RevCommit branchHead = revWalk.parseCommit(branch.getObjectId());
            String head = branchHead.getId().getName();
            RevWalk revWalk = new RevWalk(reader);
            RevCommit tip = revWalk.parseCommit(branch.getObjectId());
            String tipId = tip.getId().getName();
            String keyName = getBranchKey(branchName);
            config.setString(CONF_ALIAS, null, keyName, branchName);
            config.setString(CONF_BRANCH, null, keyName, head);
            config.setString(CONF_BRANCH, null, keyName, tipId);
            // index the blob contents of the tree
            TreeWalk treeWalk = new TreeWalk(repository);
            treeWalk.addTree(tip.getTree());
            treeWalk.setRecursive(true);
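            // map each path in the tip tree to its blob id; an entry is
            // removed once the most recent commit to touch it has been indexed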
            Map<String, ObjectId> paths = new TreeMap<String, ObjectId>();
            while (treeWalk.next()) {
               paths.put(treeWalk.getPathString(), treeWalk.getObjectId(0));
            }
            ByteArrayOutputStream os = new ByteArrayOutputStream();
            byte[] tmp = new byte[32767];
            TreeWalk treeWalk = new TreeWalk(repository);
            treeWalk.addTree(branchHead.getTree());
            treeWalk.setRecursive(true);
            while (treeWalk.next()) {
               result.blobCount++;
               String blobPath = treeWalk.getPathString();
               RevCommit blobRev = branchHead;
               RevWalk blobWalk = null;
               if (fullIndex) {
                  // XXX this is _really_ slow, there must be a better way
                  // determine the most recent commit for this blob
                  blobWalk = new RevWalk(repository);
                  blobWalk.markStart(blobWalk.parseCommit(branch.getObjectId()));
                  TreeFilter filter = AndTreeFilter.create(
                        PathFilterGroup.createFromStrings(Collections.singleton(blobPath)),
                        TreeFilter.ANY_DIFF);
                  blobWalk.setTreeFilter(filter);
                  blobRev = blobWalk.next();
               }
               String blobAuthor = getAuthor(blobRev);
               String blobCommitter = getCommitter(blobRev);
               String blobDate = DateTools.timeToString(blobRev.getCommitTime() * 1000L,
                     Resolution.MINUTE);
               if (blobWalk != null) {
                  blobWalk.dispose();
               }
               Document doc = new Document();
               doc.add(new Field(FIELD_OBJECT_TYPE, ObjectType.blob.name(), Store.YES, Index.NOT_ANALYZED_NO_NORMS));
               doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.ANALYZED));
               doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.ANALYZED));
               doc.add(new Field(FIELD_OBJECT_ID, blobPath, Store.YES, Index.ANALYZED));
               doc.add(new Field(FIELD_DATE, blobDate, Store.YES, Index.NO));
               doc.add(new Field(FIELD_AUTHOR, blobAuthor, Store.YES, Index.ANALYZED));
               doc.add(new Field(FIELD_COMMITTER, blobCommitter, Store.YES, Index.ANALYZED));
               // determine extension to compare to the extension
               // blacklist
               String ext = null;
               String name = blobPath.toLowerCase();
               if (name.indexOf('.') > -1) {
                  ext = name.substring(name.lastIndexOf('.') + 1);
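            // walk backwards from the tip; each commit's diff against its
            // parent identifies the paths it changed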
            RevWalk commitWalk = new RevWalk(reader);
            commitWalk.markStart(tip);
            RevCommit commit;
            while ((paths.size() > 0) && (commit = commitWalk.next()) != null) {
               TreeWalk diffWalk = new TreeWalk(reader);
               int parentCount = commit.getParentCount();
               switch (parentCount) {
               case 0:
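                   // a root commit has no parent, so diff against an empty tree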
                  diffWalk.addTree(new EmptyTreeIterator());
                  break;
               case 1:
                  diffWalk.addTree(getTree(commitWalk, commit.getParent(0)));
                  break;
               default:
                  // skip merge commits
                  continue;
               }
               if (StringUtils.isEmpty(ext) || !excludedExtensions.contains(ext)) {
                  // read the blob content
                  ObjectId entid = treeWalk.getObjectId(0);
                  FileMode entmode = treeWalk.getFileMode(0);
                  RevObject ro = revWalk.lookupAny(entid, entmode.getObjectType());
                  revWalk.parseBody(ro);
                  ObjectLoader ldr = repository.open(ro.getId(), Constants.OBJ_BLOB);
                  InputStream in = ldr.openStream();
                  os.reset();
                  int n = 0;
                  while ((n = in.read(tmp)) > 0) {
                     os.write(tmp, 0, n);
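               // the second tree is this commit's own tree; ANY_DIFF limits
               // the walk to the paths that changed in this commit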
               diffWalk.addTree(getTree(commitWalk, commit));
               diffWalk.setFilter(ANY_DIFF);
               diffWalk.setRecursive(true);
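               // the first commit encountered that touches a remaining path
               // is recorded as that path's most recent change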
               while ((paths.size() > 0) && diffWalk.next()) {
                  String path = diffWalk.getPathString();
                  if (!paths.containsKey(path)) {
                     continue;
                  }
                  in.close();
                  byte[] content = os.toByteArray();
                  String str = new String(content, "UTF-8");
                  doc.add(new Field(FIELD_CONTENT, str, Store.NO, Index.ANALYZED));
                  // remove path from set
                  ObjectId blobId = paths.remove(path);
                  result.blobCount++;
                  // index the blob metadata
                  String blobAuthor = getAuthor(commit);
                  String blobCommitter = getCommitter(commit);
                  String blobDate = DateTools.timeToString(commit.getCommitTime() * 1000L,
                        Resolution.MINUTE);
                  Document doc = new Document();
                  doc.add(new Field(FIELD_OBJECT_TYPE, ObjectType.blob.name(), Store.YES, Index.NOT_ANALYZED_NO_NORMS));
                  doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.ANALYZED));
                  doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.ANALYZED));
                  doc.add(new Field(FIELD_COMMIT, commit.getName(), Store.YES, Index.ANALYZED));
                  doc.add(new Field(FIELD_PATH, path, Store.YES, Index.ANALYZED));
                  doc.add(new Field(FIELD_DATE, blobDate, Store.YES, Index.NO));
                  doc.add(new Field(FIELD_AUTHOR, blobAuthor, Store.YES, Index.ANALYZED));
                  doc.add(new Field(FIELD_COMMITTER, blobCommitter, Store.YES, Index.ANALYZED));
                  // determine extension to compare to the extension
                  // blacklist
                  String ext = null;
                  String name = path.toLowerCase();
                  if (name.indexOf('.') > -1) {
                     ext = name.substring(name.lastIndexOf('.') + 1);
                  }
                  // index the blob content
                  if (StringUtils.isEmpty(ext) || !excludedExtensions.contains(ext)) {
                     ObjectLoader ldr = repository.open(blobId, Constants.OBJ_BLOB);
                     InputStream in = ldr.openStream();
                     int n;
                     while ((n = in.read(tmp)) > 0) {
                        os.write(tmp, 0, n);
                     }
                     in.close();
                     byte[] content = os.toByteArray();
                     String str = new String(content, Constants.CHARACTER_ENCODING);
                     doc.add(new Field(FIELD_CONTENT, str, Store.YES, Index.ANALYZED));
                     os.reset();
                  }
                  // add the blob to the index
                  writer.addDocument(doc);
               }
            }
            os.close();
            treeWalk.release();
            // index the head commit object
            if (indexedCommits.add(head)) {
               Document doc = createDocument(branchHead, tags.get(head));
            // index the tip commit object
            if (indexedCommits.add(tipId)) {
               Document doc = createDocument(tip, tags.get(tipId));
               doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.ANALYZED));
               doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.ANALYZED));
               writer.addDocument(doc);
@@ -373,10 +419,10 @@
            }
            // traverse the log and index the previous commit objects
            revWalk.reset();
            revWalk.markStart(branchHead);
            RevWalk historyWalk = new RevWalk(reader);
            historyWalk.markStart(historyWalk.parseCommit(tip.getId()));
            RevCommit rev;
            while ((rev = revWalk.next()) != null) {
            while ((rev = historyWalk.next()) != null) {
               String hash = rev.getId().getName();
               if (indexedCommits.add(hash)) {
                  Document doc = createDocument(rev, tags.get(hash));
@@ -386,11 +432,11 @@
                  result.commitCount += 1;
               }
            }
            // finished
            revWalk.dispose();
         }
         // finished
         reader.release();
         // this repository has a gb-issues branch, index all issues
         if (IssueUtils.getIssuesBranch(repository) != null) {
            List<IssueModel> issues = IssueUtils.getIssues(repository, null);
@@ -415,6 +461,24 @@
         e.printStackTrace();
      }
      return result;
   }
   /**
    * Get the tree associated with the given commit.
    *
    * @param walk
    * @param commit
    * @return tree
    * @throws IOException
    */
   protected static RevTree getTree(final RevWalk walk, final RevCommit commit)
         throws IOException {
      final RevTree tree = commit.getTree();
      if (tree != null) {
         return tree;
      }
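      // the tree is not available until the commit headers have been parsed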
      walk.parseHeaders(commit);
      return commit.getTree();
   }
   /**
@@ -442,7 +506,7 @@
                  IndexWriter writer = getIndexWriter(repository, false);
                  writer.deleteDocuments(
                        new Term(FIELD_OBJECT_TYPE, ObjectType.issue.name()), new Term(
                              FIELD_OBJECT_ID, issueId));
                              FIELD_ISSUE, issueId));
                  writer.commit();
                  result.success = true;
                  return result;
@@ -461,7 +525,7 @@
         for (PathChangeModel path : changedPaths) {
            // delete the indexed blob
            writer.deleteDocuments(new Term(FIELD_OBJECT_TYPE, ObjectType.blob.name()),
                  new Term(FIELD_BRANCH, branch), new Term(FIELD_OBJECT_ID, path.path));
                  new Term(FIELD_BRANCH, branch), new Term(FIELD_PATH, path.path));
            // re-index the blob
            if (!ChangeType.DELETE.equals(path.changeType)) {
@@ -471,7 +535,8 @@
                     Index.NOT_ANALYZED));
               doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.ANALYZED));
               doc.add(new Field(FIELD_BRANCH, branch, Store.YES, Index.ANALYZED));
               doc.add(new Field(FIELD_OBJECT_ID, path.path, Store.YES, Index.ANALYZED));
               doc.add(new Field(FIELD_COMMIT, commit.getName(), Store.YES, Index.ANALYZED));
               doc.add(new Field(FIELD_PATH, path.path, Store.YES, Index.ANALYZED));
               doc.add(new Field(FIELD_DATE, revDate, Store.YES, Index.NO));
               doc.add(new Field(FIELD_AUTHOR, getAuthor(commit), Store.YES, Index.ANALYZED));
               doc.add(new Field(FIELD_COMMITTER, getCommitter(commit), Store.YES, Index.ANALYZED));
@@ -488,7 +553,7 @@
                  // read the blob content
                  String str = JGitUtils.getStringContent(repository, commit.getTree(),
                        path.path);
                  doc.add(new Field(FIELD_CONTENT, str, Store.NO, Index.ANALYZED));
                  doc.add(new Field(FIELD_CONTENT, str, Store.YES, Index.ANALYZED));
                  writer.addDocument(doc);
               }
            }
@@ -517,7 +582,7 @@
         // delete the old issue from the index, if exists
         IndexWriter writer = getIndexWriter(repository, false);
         writer.deleteDocuments(new Term(FIELD_OBJECT_TYPE, ObjectType.issue.name()), new Term(
               FIELD_OBJECT_ID, String.valueOf(issue.id)));
               FIELD_ISSUE, String.valueOf(issue.id)));
         writer.commit();
         Document doc = createDocument(issue);
@@ -627,7 +692,7 @@
      Document doc = new Document();
      doc.add(new Field(FIELD_OBJECT_TYPE, ObjectType.issue.name(), Store.YES,
            Field.Index.NOT_ANALYZED));
      doc.add(new Field(FIELD_OBJECT_ID, issue.id, Store.YES, Index.ANALYZED));
      doc.add(new Field(FIELD_ISSUE, issue.id, Store.YES, Index.ANALYZED));
      doc.add(new Field(FIELD_BRANCH, IssueUtils.GB_ISSUES, Store.YES, Index.ANALYZED));
      doc.add(new Field(FIELD_DATE, DateTools.dateToString(issue.created, Resolution.MINUTE),
            Store.YES, Field.Index.NO));
@@ -639,7 +704,7 @@
      doc.add(new Field(FIELD_ATTACHMENT, StringUtils.flattenStrings(attachments), Store.YES,
            Index.ANALYZED));
      doc.add(new Field(FIELD_SUMMARY, issue.summary, Store.YES, Index.ANALYZED));
      doc.add(new Field(FIELD_CONTENT, issue.toString(), Store.NO, Index.ANALYZED));
      doc.add(new Field(FIELD_CONTENT, issue.toString(), Store.YES, Index.ANALYZED));
      doc.add(new Field(FIELD_LABEL, StringUtils.flattenStrings(issue.getLabels()), Store.YES,
            Index.ANALYZED));
      return doc;
@@ -656,13 +721,13 @@
      Document doc = new Document();
      doc.add(new Field(FIELD_OBJECT_TYPE, ObjectType.commit.name(), Store.YES,
            Index.NOT_ANALYZED));
      doc.add(new Field(FIELD_OBJECT_ID, commit.getName(), Store.YES, Index.ANALYZED));
      doc.add(new Field(FIELD_COMMIT, commit.getName(), Store.YES, Index.ANALYZED));
      doc.add(new Field(FIELD_DATE, DateTools.timeToString(commit.getCommitTime() * 1000L,
            Resolution.MINUTE), Store.YES, Index.NO));
      doc.add(new Field(FIELD_AUTHOR, getAuthor(commit), Store.YES, Index.ANALYZED));
      doc.add(new Field(FIELD_COMMITTER, getCommitter(commit), Store.YES, Index.ANALYZED));
      doc.add(new Field(FIELD_SUMMARY, commit.getShortMessage(), Store.YES, Index.ANALYZED));
      doc.add(new Field(FIELD_CONTENT, commit.getFullMessage(), Store.NO, Index.ANALYZED));
      doc.add(new Field(FIELD_CONTENT, commit.getFullMessage(), Store.YES, Index.ANALYZED));
      if (!ArrayUtils.isEmpty(tags)) {
         doc.add(new Field(FIELD_TAG, StringUtils.flattenStrings(tags), Store.YES, Index.ANALYZED));
      }
@@ -695,13 +760,15 @@
      SearchResult result = new SearchResult();
      result.score = score;
      result.date = DateTools.stringToDate(doc.get(FIELD_DATE));
      result.summary = doc.get(FIELD_SUMMARY);
      result.author = doc.get(FIELD_AUTHOR);
      result.committer = doc.get(FIELD_COMMITTER);
      result.type = ObjectType.fromName(doc.get(FIELD_OBJECT_TYPE));
      result.repository = doc.get(FIELD_REPOSITORY);
      result.branch = doc.get(FIELD_BRANCH);
      result.id = doc.get(FIELD_OBJECT_ID);
      result.commitId = doc.get(FIELD_COMMIT);
      result.issueId = doc.get(FIELD_ISSUE);
      result.path = doc.get(FIELD_PATH);
      if (doc.get(FIELD_TAG) != null) {
         result.tags = StringUtils.getStringsFromValue(doc.get(FIELD_TAG));
      }
@@ -835,6 +902,8 @@
            int docId = hits[i].doc;
            Document doc = searcher.doc(docId);
            SearchResult result = createSearchResult(doc, hits[i].score);
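            // content is stored in the index, so it can be retrieved and
            // highlighted for the result fragment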
            String content = doc.get(FIELD_CONTENT);
            result.fragment = getHighlightedFragment(analyzer, query, content);
            results.add(result);
         }
      } catch (Exception e) {
@@ -842,6 +911,37 @@
      }
      return new ArrayList<SearchResult>(results);
   }
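   /**
    * Returns an html fragment of the content highlighted by the query.  The
    * content is escaped for html and scored against the query; up to five of
    * the best fragments are returned, each wrapped in a pre block and
    * separated by an ellipsis.
    * 
    * @param analyzer
    * @param query
    * @param content
    * @return highlighted fragment
    * @throws IOException
    * @throws InvalidTokenOffsetsException
    */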
   private static String getHighlightedFragment(Analyzer analyzer, Query query,
         String content) throws IOException, InvalidTokenOffsetsException {
      content = content == null ? "" : StringUtils.escapeForHtml(content, false);
      TokenStream stream = TokenSources.getTokenStream("content", content, analyzer);
      QueryScorer scorer = new QueryScorer(query, "content");
      Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, 150);
      SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<span class=\"highlight\">", "</span>");
      Highlighter highlighter = new Highlighter(formatter, scorer);
      highlighter.setTextFragmenter(fragmenter);
      String[] fragments = highlighter.getBestFragments(stream, content, 5);
      if (ArrayUtils.isEmpty(fragments)) {
         return content;
      }
      if (fragments.length == 1) {
         return "<pre>" + fragments[0] + "</pre>";
      }
      StringBuilder sb = new StringBuilder();
      for (int i = 0, len = fragments.length; i < len; i++) {
         String fragment = fragments[i].trim();
         sb.append("<pre>");
         sb.append(fragment);
         sb.append("</pre>");
         if (i < len - 1) {
            sb.append("<span class=\"ellipses\">...</span><br/>");
         }
      }
      return sb.toString();
   }
   /**
    * Close all the index writers and searchers