From eecaad8b8e2c447429c31a01d49260ddd6b4ee03 Mon Sep 17 00:00:00 2001 From: Paul Martin <paul@paulsputer.com> Date: Sat, 16 Apr 2016 17:35:32 -0400 Subject: [PATCH] Proof of concept #1026 --- src/main/java/com/gitblit/service/LuceneService.java | 141 ++++++++++++++++++++++++++-------------------- 1 files changed, 79 insertions(+), 62 deletions(-) diff --git a/src/main/java/com/gitblit/service/LuceneService.java b/src/main/java/com/gitblit/service/LuceneService.java index 97fe9e1..62f7df7 100644 --- a/src/main/java/com/gitblit/service/LuceneService.java +++ b/src/main/java/com/gitblit/service/LuceneService.java @@ -19,9 +19,9 @@ import java.io.ByteArrayOutputStream; import java.io.File; +import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; -import java.lang.reflect.Method; import java.text.MessageFormat; import java.text.ParseException; import java.util.ArrayList; @@ -42,15 +42,16 @@ import org.apache.lucene.document.DateTools.Resolution; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.document.Field.Index; -import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.MultiReader; import org.apache.lucene.index.Term; -import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; @@ -66,6 +67,11 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; +import 
org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.pdf.PDFParser; +import org.apache.tika.sax.BodyContentHandler; import org.eclipse.jgit.diff.DiffEntry.ChangeType; import org.eclipse.jgit.lib.Constants; import org.eclipse.jgit.lib.FileMode; @@ -85,8 +91,11 @@ import org.slf4j.LoggerFactory; import com.gitblit.Constants.SearchObjectType; +import com.gitblit.GitBlit; import com.gitblit.IStoredSettings; import com.gitblit.Keys; +import com.gitblit.manager.FilestoreManager; +import com.gitblit.manager.IFilestoreManager; import com.gitblit.manager.IRepositoryManager; import com.gitblit.models.PathModel.PathChangeModel; import com.gitblit.models.RefModel; @@ -105,7 +114,7 @@ public class LuceneService implements Runnable { - private static final int INDEX_VERSION = 5; + private static final int INDEX_VERSION = 6; private static final String FIELD_OBJECT_TYPE = "type"; private static final String FIELD_PATH = "path"; @@ -125,12 +134,14 @@ private static final String CONF_ALIAS = "aliases"; private static final String CONF_BRANCH = "branches"; - private static final Version LUCENE_VERSION = Version.LUCENE_35; + private static final Version LUCENE_VERSION = Version.LUCENE_4_10_0; private final Logger logger = LoggerFactory.getLogger(LuceneService.class); private final IStoredSettings storedSettings; private final IRepositoryManager repositoryManager; + private final IFilestoreManager filestoreManager; + private final File repositoriesFolder; private final Map<String, IndexSearcher> searchers = new ConcurrentHashMap<String, IndexSearcher>(); @@ -141,10 +152,12 @@ public LuceneService( IStoredSettings settings, - IRepositoryManager repositoryManager) { + IRepositoryManager repositoryManager, + IFilestoreManager filestoreManager) { this.storedSettings = settings; this.repositoryManager = repositoryManager; + this.filestoreManager = filestoreManager; 
this.repositoriesFolder = repositoryManager.getRepositoriesFolder(); String exts = luceneIgnoreExtensions; if (settings != null) { @@ -194,7 +207,7 @@ * Synchronously indexes a repository. This may build a complete index of a * repository or it may update an existing index. * - * @param name + * @param displayName * the name of the repository * @param repository * the repository object @@ -437,7 +450,7 @@ // skip non-annotated tags continue; } - if (!tags.containsKey(tag.getObjectId())) { + if (!tags.containsKey(tag.getReferencedObjectId().getName())) { tags.put(tag.getReferencedObjectId().getName(), new ArrayList<String>()); } tags.get(tag.getReferencedObjectId().getName()).add(tag.displayName); @@ -476,8 +489,8 @@ && branch.equals(defaultBranch)) { // indexing "default" branch indexBranch = true; - } else if (branch.getName().startsWith(com.gitblit.Constants.R_GITBLIT)) { - // skip Gitblit internal branches + } else if (branch.getName().startsWith(com.gitblit.Constants.R_META)) { + // skip internal meta branches indexBranch = false; } else { // normal explicit branch check @@ -540,7 +553,8 @@ if (!paths.containsKey(path)) { continue; } - +//TODO: Figure out the filestore oid for the path - a bit more involved than updating the index + // remove path from set ObjectId blobId = paths.remove(path); result.blobCount++; @@ -552,13 +566,13 @@ Resolution.MINUTE); Document doc = new Document(); - doc.add(new Field(FIELD_OBJECT_TYPE, SearchObjectType.blob.name(), Store.YES, Index.NOT_ANALYZED_NO_NORMS)); - doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.ANALYZED)); - doc.add(new Field(FIELD_COMMIT, commit.getName(), Store.YES, Index.ANALYZED)); - doc.add(new Field(FIELD_PATH, path, Store.YES, Index.ANALYZED)); - doc.add(new Field(FIELD_DATE, blobDate, Store.YES, Index.NO)); - doc.add(new Field(FIELD_AUTHOR, blobAuthor, Store.YES, Index.ANALYZED)); - doc.add(new Field(FIELD_COMMITTER, blobCommitter, Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_OBJECT_TYPE, 
SearchObjectType.blob.name(), StringField.TYPE_STORED)); + doc.add(new Field(FIELD_BRANCH, branchName, TextField.TYPE_STORED)); + doc.add(new Field(FIELD_COMMIT, commit.getName(), TextField.TYPE_STORED)); + doc.add(new Field(FIELD_PATH, path, TextField.TYPE_STORED)); + doc.add(new Field(FIELD_DATE, blobDate, StringField.TYPE_STORED)); + doc.add(new Field(FIELD_AUTHOR, blobAuthor, TextField.TYPE_STORED)); + doc.add(new Field(FIELD_COMMITTER, blobCommitter, TextField.TYPE_STORED)); // determine extension to compare to the extension // blacklist @@ -579,7 +593,7 @@ in.close(); byte[] content = os.toByteArray(); String str = StringUtils.decodeString(content, encodings); - doc.add(new Field(FIELD_CONTENT, str, Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_CONTENT, str, TextField.TYPE_STORED)); os.reset(); } @@ -593,7 +607,7 @@ // index the tip commit object if (indexedCommits.add(tipId)) { Document doc = createDocument(tip, tags.get(tipId)); - doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_BRANCH, branchName, TextField.TYPE_STORED)); writer.addDocument(doc); result.commitCount += 1; result.branchCount += 1; @@ -607,7 +621,7 @@ String hash = rev.getId().getName(); if (indexedCommits.add(hash)) { Document doc = createDocument(rev, tags.get(hash)); - doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_BRANCH, branchName, TextField.TYPE_STORED)); writer.addDocument(doc); result.commitCount += 1; } @@ -615,7 +629,7 @@ } // finished - reader.release(); + reader.close(); // commit all changes and reset the searcher config.setInt(CONF_INDEX, null, CONF_VERSION, INDEX_VERSION); @@ -660,14 +674,13 @@ if (!ChangeType.DELETE.equals(path.changeType)) { result.blobCount++; Document doc = new Document(); - doc.add(new Field(FIELD_OBJECT_TYPE, SearchObjectType.blob.name(), Store.YES, - Index.NOT_ANALYZED)); - doc.add(new Field(FIELD_BRANCH, branch, Store.YES, Index.ANALYZED)); - 
doc.add(new Field(FIELD_COMMIT, commit.getName(), Store.YES, Index.ANALYZED)); - doc.add(new Field(FIELD_PATH, path.path, Store.YES, Index.ANALYZED)); - doc.add(new Field(FIELD_DATE, revDate, Store.YES, Index.NO)); - doc.add(new Field(FIELD_AUTHOR, getAuthor(commit), Store.YES, Index.ANALYZED)); - doc.add(new Field(FIELD_COMMITTER, getCommitter(commit), Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_OBJECT_TYPE, SearchObjectType.blob.name(), StringField.TYPE_STORED)); + doc.add(new Field(FIELD_BRANCH, branch, TextField.TYPE_STORED)); + doc.add(new Field(FIELD_COMMIT, commit.getName(), TextField.TYPE_STORED)); + doc.add(new Field(FIELD_PATH, path.path, TextField.TYPE_STORED)); + doc.add(new Field(FIELD_DATE, revDate, StringField.TYPE_STORED)); + doc.add(new Field(FIELD_AUTHOR, getAuthor(commit), TextField.TYPE_STORED)); + doc.add(new Field(FIELD_COMMITTER, getCommitter(commit), TextField.TYPE_STORED)); // determine extension to compare to the extension // blacklist @@ -678,11 +691,26 @@ } if (StringUtils.isEmpty(ext) || !excludedExtensions.contains(ext)) { + String str = ""; // read the blob content - String str = JGitUtils.getStringContent(repository, commit.getTree(), + if (path.isFilestoreItem()) { + //Get file from filestore + BodyContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + PDFParser parser = new PDFParser(); + + ParseContext parseContext = new ParseContext(); + File lfsFile = filestoreManager.getStoragePath(path.getFilestoreOid()); + FileInputStream inputstream = new FileInputStream(lfsFile); + parser.parse(inputstream, handler, metadata, parseContext); + str = handler.toString(); + } else { + str = JGitUtils.getStringContent(repository, commit.getTree(), path.path, encodings); + } + if (str != null) { - doc.add(new Field(FIELD_CONTENT, str, Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_CONTENT, str, TextField.TYPE_STORED)); writer.addDocument(doc); } } @@ -700,7 +728,7 @@ // create and write the 
Lucene document Document doc = createDocument(commit, commitTags); - doc.add(new Field(FIELD_BRANCH, branch, Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_BRANCH, branch, TextField.TYPE_STORED)); result.commitCount++; result.success = index(repositoryName, doc); } catch (Exception e) { @@ -761,7 +789,7 @@ // skip non-annotated tags continue; } - if (!tags.containsKey(tag.getObjectId())) { + if (!tags.containsKey(tag.getReferencedObjectId().getName())) { tags.put(tag.getReferencedObjectId().getName(), new ArrayList<String>()); } tags.get(tag.getReferencedObjectId().getName()).add(tag.displayName); @@ -808,8 +836,8 @@ && branch.equals(defaultBranch)) { // indexing "default" branch indexBranch = true; - } else if (branch.getName().startsWith(com.gitblit.Constants.R_GITBLIT)) { - // ignore internal Gitblit branches + } else if (branch.getName().startsWith(com.gitblit.Constants.R_META)) { + // ignore internal meta branches indexBranch = false; } else { // normal explicit branch check @@ -880,17 +908,16 @@ */ private Document createDocument(RevCommit commit, List<String> tags) { Document doc = new Document(); - doc.add(new Field(FIELD_OBJECT_TYPE, SearchObjectType.commit.name(), Store.YES, - Index.NOT_ANALYZED)); - doc.add(new Field(FIELD_COMMIT, commit.getName(), Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_OBJECT_TYPE, SearchObjectType.commit.name(), StringField.TYPE_STORED)); + doc.add(new Field(FIELD_COMMIT, commit.getName(), TextField.TYPE_STORED)); doc.add(new Field(FIELD_DATE, DateTools.timeToString(commit.getCommitTime() * 1000L, - Resolution.MINUTE), Store.YES, Index.NO)); - doc.add(new Field(FIELD_AUTHOR, getAuthor(commit), Store.YES, Index.ANALYZED)); - doc.add(new Field(FIELD_COMMITTER, getCommitter(commit), Store.YES, Index.ANALYZED)); - doc.add(new Field(FIELD_SUMMARY, commit.getShortMessage(), Store.YES, Index.ANALYZED)); - doc.add(new Field(FIELD_CONTENT, commit.getFullMessage(), Store.YES, Index.ANALYZED)); + Resolution.MINUTE), 
StringField.TYPE_STORED)); + doc.add(new Field(FIELD_AUTHOR, getAuthor(commit), TextField.TYPE_STORED)); + doc.add(new Field(FIELD_COMMITTER, getCommitter(commit), TextField.TYPE_STORED)); + doc.add(new Field(FIELD_SUMMARY, commit.getShortMessage(), TextField.TYPE_STORED)); + doc.add(new Field(FIELD_CONTENT, commit.getFullMessage(), TextField.TYPE_STORED)); if (!ArrayUtils.isEmpty(tags)) { - doc.add(new Field(FIELD_TAG, StringUtils.flattenStrings(tags), Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_TAG, StringUtils.flattenStrings(tags), TextField.TYPE_STORED)); } return doc; } @@ -952,7 +979,7 @@ IndexSearcher searcher = searchers.get(repository); if (searcher == null) { IndexWriter writer = getIndexWriter(repository); - searcher = new IndexSearcher(IndexReader.open(writer, true)); + searcher = new IndexSearcher(DirectoryReader.open(writer, true)); searchers.put(repository, searcher); } return searcher; @@ -1106,6 +1133,7 @@ content = ""; } + int tabLength = storedSettings.getInteger(Keys.web.tabLength, 4); int fragmentLength = SearchObjectType.commit == result.type ? 
512 : 150; QueryScorer scorer = new QueryScorer(query, "content"); @@ -1128,7 +1156,7 @@ if (fragment.length() > fragmentLength) { fragment = fragment.substring(0, fragmentLength) + "..."; } - return "<pre class=\"text\">" + StringUtils.escapeForHtml(fragment, true) + "</pre>"; + return "<pre class=\"text\">" + StringUtils.escapeForHtml(fragment, true, tabLength) + "</pre>"; } // make sure we have unique fragments @@ -1226,25 +1254,14 @@ */ private class MultiSourceReader extends MultiReader { - final Method method; - - MultiSourceReader(IndexReader[] subReaders) { - super(subReaders); - Method m = null; - try { - m = MultiReader.class.getDeclaredMethod("readerIndex", int.class); - m.setAccessible(true); - } catch (Exception e) { - logger.error("Error getting readerIndex method", e); - } - method = m; + MultiSourceReader(IndexReader [] readers) { + super(readers, false); } int getSourceIndex(int docId) { int index = -1; try { - Object o = method.invoke(this, docId); - index = (Integer) o; + index = super.readerIndex(docId); } catch (Exception e) { logger.error("Error getting source index", e); } -- Gitblit v1.9.1