From eecaad8b8e2c447429c31a01d49260ddd6b4ee03 Mon Sep 17 00:00:00 2001
From: Paul Martin <paul@paulsputer.com>
Date: Sat, 16 Apr 2016 17:35:32 -0400
Subject: [PATCH] Proof of concept #1026

---
 src/main/java/com/gitblit/service/LuceneService.java |  141 ++++++++++++++++++++++++++--------------------
 1 files changed, 79 insertions(+), 62 deletions(-)

diff --git a/src/main/java/com/gitblit/service/LuceneService.java b/src/main/java/com/gitblit/service/LuceneService.java
index 97fe9e1..62f7df7 100644
--- a/src/main/java/com/gitblit/service/LuceneService.java
+++ b/src/main/java/com/gitblit/service/LuceneService.java
@@ -19,9 +19,9 @@
 
 import java.io.ByteArrayOutputStream;
 import java.io.File;
+import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.lang.reflect.Method;
 import java.text.MessageFormat;
 import java.text.ParseException;
 import java.util.ArrayList;
@@ -42,15 +42,16 @@
 import org.apache.lucene.document.DateTools.Resolution;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-import org.apache.lucene.document.Field.Index;
-import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
 import org.apache.lucene.index.MultiReader;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.queryparser.classic.QueryParser;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.IndexSearcher;
@@ -66,6 +67,11 @@
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.util.Version;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.PDFParser;
+import org.apache.tika.sax.BodyContentHandler;
 import org.eclipse.jgit.diff.DiffEntry.ChangeType;
 import org.eclipse.jgit.lib.Constants;
 import org.eclipse.jgit.lib.FileMode;
@@ -85,8 +91,11 @@
 import org.slf4j.LoggerFactory;
 
 import com.gitblit.Constants.SearchObjectType;
+import com.gitblit.GitBlit;
 import com.gitblit.IStoredSettings;
 import com.gitblit.Keys;
+import com.gitblit.manager.FilestoreManager;
+import com.gitblit.manager.IFilestoreManager;
 import com.gitblit.manager.IRepositoryManager;
 import com.gitblit.models.PathModel.PathChangeModel;
 import com.gitblit.models.RefModel;
@@ -105,7 +114,7 @@
 public class LuceneService implements Runnable {
 
 
-	private static final int INDEX_VERSION = 5;
+	private static final int INDEX_VERSION = 6;
 
 	private static final String FIELD_OBJECT_TYPE = "type";
 	private static final String FIELD_PATH = "path";
@@ -125,12 +134,14 @@
 	private static final String CONF_ALIAS = "aliases";
 	private static final String CONF_BRANCH = "branches";
 
-	private static final Version LUCENE_VERSION = Version.LUCENE_35;
+	private static final Version LUCENE_VERSION = Version.LUCENE_4_10_0;
 
 	private final Logger logger = LoggerFactory.getLogger(LuceneService.class);
 
 	private final IStoredSettings storedSettings;
 	private final IRepositoryManager repositoryManager;
+	private final IFilestoreManager filestoreManager;
+	
 	private final File repositoriesFolder;
 
 	private final Map<String, IndexSearcher> searchers = new ConcurrentHashMap<String, IndexSearcher>();
@@ -141,10 +152,12 @@
 
 	public LuceneService(
 			IStoredSettings settings,
-			IRepositoryManager repositoryManager) {
+			IRepositoryManager repositoryManager, 
+			IFilestoreManager filestoreManager) {
 
 		this.storedSettings = settings;
 		this.repositoryManager = repositoryManager;
+		this.filestoreManager = filestoreManager;
 		this.repositoriesFolder = repositoryManager.getRepositoriesFolder();
 		String exts = luceneIgnoreExtensions;
 		if (settings != null) {
@@ -194,7 +207,7 @@
 	 * Synchronously indexes a repository. This may build a complete index of a
 	 * repository or it may update an existing index.
 	 *
-	 * @param name
+	 * @param displayName
 	 *            the name of the repository
 	 * @param repository
 	 *            the repository object
@@ -437,7 +450,7 @@
 					// skip non-annotated tags
 					continue;
 				}
-				if (!tags.containsKey(tag.getObjectId())) {
+				if (!tags.containsKey(tag.getReferencedObjectId().getName())) {
 					tags.put(tag.getReferencedObjectId().getName(), new ArrayList<String>());
 				}
 				tags.get(tag.getReferencedObjectId().getName()).add(tag.displayName);
@@ -476,8 +489,8 @@
 						&& branch.equals(defaultBranch)) {
 					// indexing "default" branch
 					indexBranch = true;
-				} else if (branch.getName().startsWith(com.gitblit.Constants.R_GITBLIT)) {
-					// skip Gitblit internal branches
+				} else if (branch.getName().startsWith(com.gitblit.Constants.R_META)) {
+					// skip internal meta branches
 					indexBranch = false;
 				} else {
 					// normal explicit branch check
@@ -540,7 +553,8 @@
 						if (!paths.containsKey(path)) {
 							continue;
 						}
-
+//TODO: Figure out the filestore oid for the path - a bit more involved than when updating the index
+						
 						// remove path from set
 						ObjectId blobId = paths.remove(path);
 						result.blobCount++;
@@ -552,13 +566,13 @@
 								Resolution.MINUTE);
 
 						Document doc = new Document();
-						doc.add(new Field(FIELD_OBJECT_TYPE, SearchObjectType.blob.name(), Store.YES, Index.NOT_ANALYZED_NO_NORMS));
-						doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.ANALYZED));
-						doc.add(new Field(FIELD_COMMIT, commit.getName(), Store.YES, Index.ANALYZED));
-						doc.add(new Field(FIELD_PATH, path, Store.YES, Index.ANALYZED));
-						doc.add(new Field(FIELD_DATE, blobDate, Store.YES, Index.NO));
-						doc.add(new Field(FIELD_AUTHOR, blobAuthor, Store.YES, Index.ANALYZED));
-						doc.add(new Field(FIELD_COMMITTER, blobCommitter, Store.YES, Index.ANALYZED));
+						doc.add(new Field(FIELD_OBJECT_TYPE, SearchObjectType.blob.name(), StringField.TYPE_STORED));
+						doc.add(new Field(FIELD_BRANCH, branchName, TextField.TYPE_STORED));
+						doc.add(new Field(FIELD_COMMIT, commit.getName(), TextField.TYPE_STORED));
+						doc.add(new Field(FIELD_PATH, path, TextField.TYPE_STORED));
+						doc.add(new Field(FIELD_DATE, blobDate, StringField.TYPE_STORED));
+						doc.add(new Field(FIELD_AUTHOR, blobAuthor, TextField.TYPE_STORED));
+						doc.add(new Field(FIELD_COMMITTER, blobCommitter, TextField.TYPE_STORED));
 
 						// determine extension to compare to the extension
 						// blacklist
@@ -579,7 +593,7 @@
 							in.close();
 							byte[] content = os.toByteArray();
 							String str = StringUtils.decodeString(content, encodings);
-							doc.add(new Field(FIELD_CONTENT, str, Store.YES, Index.ANALYZED));
+							doc.add(new Field(FIELD_CONTENT, str, TextField.TYPE_STORED));
 							os.reset();
 						}
 
@@ -593,7 +607,7 @@
 				// index the tip commit object
 				if (indexedCommits.add(tipId)) {
 					Document doc = createDocument(tip, tags.get(tipId));
-					doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.ANALYZED));
+					doc.add(new Field(FIELD_BRANCH, branchName, TextField.TYPE_STORED));
 					writer.addDocument(doc);
 					result.commitCount += 1;
 					result.branchCount += 1;
@@ -607,7 +621,7 @@
 					String hash = rev.getId().getName();
 					if (indexedCommits.add(hash)) {
 						Document doc = createDocument(rev, tags.get(hash));
-						doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.ANALYZED));
+						doc.add(new Field(FIELD_BRANCH, branchName, TextField.TYPE_STORED));
 						writer.addDocument(doc);
 						result.commitCount += 1;
 					}
@@ -615,7 +629,7 @@
 			}
 
 			// finished
-			reader.release();
+			reader.close();
 
 			// commit all changes and reset the searcher
 			config.setInt(CONF_INDEX, null, CONF_VERSION, INDEX_VERSION);
@@ -660,14 +674,13 @@
 				if (!ChangeType.DELETE.equals(path.changeType)) {
 					result.blobCount++;
 					Document doc = new Document();
-					doc.add(new Field(FIELD_OBJECT_TYPE, SearchObjectType.blob.name(), Store.YES,
-							Index.NOT_ANALYZED));
-					doc.add(new Field(FIELD_BRANCH, branch, Store.YES, Index.ANALYZED));
-					doc.add(new Field(FIELD_COMMIT, commit.getName(), Store.YES, Index.ANALYZED));
-					doc.add(new Field(FIELD_PATH, path.path, Store.YES, Index.ANALYZED));
-					doc.add(new Field(FIELD_DATE, revDate, Store.YES, Index.NO));
-					doc.add(new Field(FIELD_AUTHOR, getAuthor(commit), Store.YES, Index.ANALYZED));
-					doc.add(new Field(FIELD_COMMITTER, getCommitter(commit), Store.YES, Index.ANALYZED));
+					doc.add(new Field(FIELD_OBJECT_TYPE, SearchObjectType.blob.name(), StringField.TYPE_STORED));
+					doc.add(new Field(FIELD_BRANCH, branch, TextField.TYPE_STORED));
+					doc.add(new Field(FIELD_COMMIT, commit.getName(), TextField.TYPE_STORED));
+					doc.add(new Field(FIELD_PATH, path.path, TextField.TYPE_STORED));
+					doc.add(new Field(FIELD_DATE, revDate, StringField.TYPE_STORED));
+					doc.add(new Field(FIELD_AUTHOR, getAuthor(commit), TextField.TYPE_STORED));
+					doc.add(new Field(FIELD_COMMITTER, getCommitter(commit), TextField.TYPE_STORED));
 
 					// determine extension to compare to the extension
 					// blacklist
@@ -678,11 +691,26 @@
 					}
 
 					if (StringUtils.isEmpty(ext) || !excludedExtensions.contains(ext)) {
+						String str = "";
 						// read the blob content
-						String str = JGitUtils.getStringContent(repository, commit.getTree(),
+						if (path.isFilestoreItem()) {
+							// Filestore item: extract its text with Tika; parse() never closes the stream, so close it here
+							BodyContentHandler handler = new BodyContentHandler();
+							Metadata metadata = new Metadata();
+							PDFParser parser = new PDFParser();
+							ParseContext parseContext = new ParseContext();
+							File lfsFile = filestoreManager.getStoragePath(path.getFilestoreOid());
+							try (FileInputStream inputstream = new FileInputStream(lfsFile)) {
+								parser.parse(inputstream, handler, metadata, parseContext);
+							}
+							str = handler.toString();
+						} else {
+							str = JGitUtils.getStringContent(repository, commit.getTree(),
 								path.path, encodings);
+						}
+						
 						if (str != null) {
-							doc.add(new Field(FIELD_CONTENT, str, Store.YES, Index.ANALYZED));
+							doc.add(new Field(FIELD_CONTENT, str, TextField.TYPE_STORED));
 							writer.addDocument(doc);
 						}
 					}
@@ -700,7 +728,7 @@
 
 			// create and write the Lucene document
 			Document doc = createDocument(commit, commitTags);
-			doc.add(new Field(FIELD_BRANCH, branch, Store.YES, Index.ANALYZED));
+			doc.add(new Field(FIELD_BRANCH, branch, TextField.TYPE_STORED));
 			result.commitCount++;
 			result.success = index(repositoryName, doc);
 		} catch (Exception e) {
@@ -761,7 +789,7 @@
 					// skip non-annotated tags
 					continue;
 				}
-				if (!tags.containsKey(tag.getObjectId())) {
+				if (!tags.containsKey(tag.getReferencedObjectId().getName())) {
 					tags.put(tag.getReferencedObjectId().getName(), new ArrayList<String>());
 				}
 				tags.get(tag.getReferencedObjectId().getName()).add(tag.displayName);
@@ -808,8 +836,8 @@
 						&& branch.equals(defaultBranch)) {
 					// indexing "default" branch
 					indexBranch = true;
-				} else if (branch.getName().startsWith(com.gitblit.Constants.R_GITBLIT)) {
-					// ignore internal Gitblit branches
+				} else if (branch.getName().startsWith(com.gitblit.Constants.R_META)) {
+					// ignore internal meta branches
 					indexBranch = false;
 				} else {
 					// normal explicit branch check
@@ -880,17 +908,16 @@
 	 */
 	private Document createDocument(RevCommit commit, List<String> tags) {
 		Document doc = new Document();
-		doc.add(new Field(FIELD_OBJECT_TYPE, SearchObjectType.commit.name(), Store.YES,
-				Index.NOT_ANALYZED));
-		doc.add(new Field(FIELD_COMMIT, commit.getName(), Store.YES, Index.ANALYZED));
+		doc.add(new Field(FIELD_OBJECT_TYPE, SearchObjectType.commit.name(), StringField.TYPE_STORED));
+		doc.add(new Field(FIELD_COMMIT, commit.getName(), TextField.TYPE_STORED));
 		doc.add(new Field(FIELD_DATE, DateTools.timeToString(commit.getCommitTime() * 1000L,
-				Resolution.MINUTE), Store.YES, Index.NO));
-		doc.add(new Field(FIELD_AUTHOR, getAuthor(commit), Store.YES, Index.ANALYZED));
-		doc.add(new Field(FIELD_COMMITTER, getCommitter(commit), Store.YES, Index.ANALYZED));
-		doc.add(new Field(FIELD_SUMMARY, commit.getShortMessage(), Store.YES, Index.ANALYZED));
-		doc.add(new Field(FIELD_CONTENT, commit.getFullMessage(), Store.YES, Index.ANALYZED));
+				Resolution.MINUTE), StringField.TYPE_STORED));
+		doc.add(new Field(FIELD_AUTHOR, getAuthor(commit), TextField.TYPE_STORED));
+		doc.add(new Field(FIELD_COMMITTER, getCommitter(commit), TextField.TYPE_STORED));
+		doc.add(new Field(FIELD_SUMMARY, commit.getShortMessage(), TextField.TYPE_STORED));
+		doc.add(new Field(FIELD_CONTENT, commit.getFullMessage(), TextField.TYPE_STORED));
 		if (!ArrayUtils.isEmpty(tags)) {
-			doc.add(new Field(FIELD_TAG, StringUtils.flattenStrings(tags), Store.YES, Index.ANALYZED));
+			doc.add(new Field(FIELD_TAG, StringUtils.flattenStrings(tags), TextField.TYPE_STORED));
 		}
 		return doc;
 	}
@@ -952,7 +979,7 @@
 		IndexSearcher searcher = searchers.get(repository);
 		if (searcher == null) {
 			IndexWriter writer = getIndexWriter(repository);
-			searcher = new IndexSearcher(IndexReader.open(writer, true));
+			searcher = new IndexSearcher(DirectoryReader.open(writer, true));
 			searchers.put(repository, searcher);
 		}
 		return searcher;
@@ -1106,6 +1133,7 @@
 			content = "";
 		}
 
+		int tabLength = storedSettings.getInteger(Keys.web.tabLength, 4);
 		int fragmentLength = SearchObjectType.commit == result.type ? 512 : 150;
 
 		QueryScorer scorer = new QueryScorer(query, "content");
@@ -1128,7 +1156,7 @@
 			if (fragment.length() > fragmentLength) {
 				fragment = fragment.substring(0, fragmentLength) + "...";
 			}
-			return "<pre class=\"text\">" + StringUtils.escapeForHtml(fragment, true) + "</pre>";
+			return "<pre class=\"text\">" + StringUtils.escapeForHtml(fragment, true, tabLength) + "</pre>";
 		}
 
 		// make sure we have unique fragments
@@ -1226,25 +1254,14 @@
 	 */
 	private class MultiSourceReader extends MultiReader {
 
-		final Method method;
-
-		MultiSourceReader(IndexReader[] subReaders) {
-			super(subReaders);
-			Method m = null;
-			try {
-				m = MultiReader.class.getDeclaredMethod("readerIndex", int.class);
-				m.setAccessible(true);
-			} catch (Exception e) {
-				logger.error("Error getting readerIndex method", e);
-			}
-			method = m;
+		MultiSourceReader(IndexReader [] readers) {
+			super(readers, false);
 		}
 
 		int getSourceIndex(int docId) {
 			int index = -1;
 			try {
-				Object o = method.invoke(this, docId);
-				index = (Integer) o;
+				index = super.readerIndex(docId);
 			} catch (Exception e) {
 				logger.error("Error getting source index", e);
 			}

--
Gitblit v1.9.1