//
// LuceneQueryingDriver.cs
//
// Copyright (C) 2004-2005 Novell, Inc.
//
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//

using System;
using System.Collections;
using System.Diagnostics;
using System.Globalization;
using System.IO;
using System.Text;
using System.Threading;
using System.Xml;
using System.Xml.Serialization;

using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using LNS = Lucene.Net.Search;

using Beagle.Util;

namespace Beagle.Daemon {

	public class LuceneQueryingDriver : LuceneCommon {

		static public bool Debug = false;

		public const string PrivateNamespace = "_private:";

		public delegate bool UriFilter (Uri uri);
		public delegate double RelevancyMultiplier (Hit hit);

		public LuceneQueryingDriver (string index_name, int minor_version, bool read_only)
			: base (index_name, minor_version)
		{
			// FIXME: Maybe the LuceneQueryingDriver should never try to create the index?
			if (Exists ())
				Open (read_only);
			else if (!read_only)
				Create ();
			else {
				// We're in read-only mode, but the index doesn't exist
				// and we can't create one.  Maybe a different exception
				// would be better?  This one is caught in
				// QueryDriver.LoadStaticQueryable ().
				throw new InvalidOperationException ();
			}

			// Initialize the user text cache only if we're not in
			// read-only mode.  StaticQueryables instantiate their
			// own text caches that are stored in a separate
			// location.
			if (!read_only)
				text_cache = TextCache.UserCache;
		}

		////////////////////////////////////////////////////////////////

		////////////////////////////////////////////////////////////////

		public Uri[] PropertyQuery (Property prop)
		{
			// FIXME: Should we support scanning the secondary
			// index as well?

			IndexReader primary_reader;
			LNS.IndexSearcher primary_searcher;

			primary_reader = LuceneCommon.GetReader (PrimaryStore);
			primary_searcher = new LNS.IndexSearcher (primary_reader);

			Term term = new Term (PropertyToFieldName (prop.Type, prop.Key), prop.Value);
			LNS.TermQuery query = new LNS.TermQuery (term);
			LNS.Hits hits = primary_searcher.Search (query);

			Uri[] uri_list = new Uri [hits.Length ()];
			for (int i = 0; i < hits.Length (); i++) {
				Document doc;
				doc = hits.Doc (i);
				uri_list [i] = GetUriFromDocument (doc);
			}

			primary_searcher.Close ();
			ReleaseReader (primary_reader);

			return uri_list;
		}

		////////////////////////////////////////////////////////////////

		// Execute a query against the index.  This method returns
		// nothing; matching hits are streamed out through the
		// IQueryResult, and the result set is truncated at
		// query.MaxHits.
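		//
		// Illustrative caller sketch (not part of this file; the index
		// name, minor version and the use of QueryResult as the
		// IQueryResult implementation are assumptions for the example):
		//
		//     LuceneQueryingDriver driver;
		//     driver = new LuceneQueryingDriver ("FileSystemIndex", -1, true);
		//
		//     Query q = new Query ();
		//     q.AddText ("apple");
		//
		//     QueryResult result = new QueryResult ();
		//     driver.DoQuery (q, result, null, null, null);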
		public void DoQuery (Query        query,
				     IQueryResult result,
				     ICollection  search_subset_uris, // should be internal uris
				     UriFilter    uri_filter,
				     HitFilter    hit_filter)
		{
			Stopwatch sw;
			sw = new Stopwatch ();
			sw.Start ();

			// Assemble all of the parts into a bunch of Lucene queries

			ArrayList primary_required_part_queries = null;
			ArrayList secondary_required_part_queries = null;

			LNS.BooleanQuery primary_prohibited_part_query = null;
			LNS.BooleanQuery secondary_prohibited_part_query = null;

			AndHitFilter all_hit_filters;
			all_hit_filters = new AndHitFilter ();
			if (hit_filter != null)
				all_hit_filters.Add (hit_filter);

			ArrayList term_list = new ArrayList ();

			foreach (QueryPart part in query.Parts) {
				LNS.Query primary_part_query;
				LNS.Query secondary_part_query;
				HitFilter part_hit_filter;
				QueryPartToQuery (part,
						  false, // we want both primary and secondary queries
						  part.Logic == QueryPartLogic.Required ? term_list : null,
						  out primary_part_query,
						  out secondary_part_query,
						  out part_hit_filter);

				if (primary_part_query == null)
					continue;

				switch (part.Logic) {

				case QueryPartLogic.Required:
					if (primary_required_part_queries == null) {
						primary_required_part_queries = new ArrayList ();
						secondary_required_part_queries = new ArrayList ();
					}
					primary_required_part_queries.Add (primary_part_query);
					secondary_required_part_queries.Add (secondary_part_query);

					if (part_hit_filter != null)
						all_hit_filters.Add (part_hit_filter);

					break;

				case QueryPartLogic.Prohibited:
					if (primary_prohibited_part_query == null)
						primary_prohibited_part_query = new LNS.BooleanQuery ();
					primary_prohibited_part_query.Add (primary_part_query, false, false);

					if (secondary_part_query != null) {
						if (secondary_prohibited_part_query == null)
							secondary_prohibited_part_query = new LNS.BooleanQuery ();
						secondary_prohibited_part_query.Add (secondary_part_query, false, false);
					}

					if (part_hit_filter != null) {
						NotHitFilter nhf;
						nhf = new NotHitFilter (part_hit_filter);
						all_hit_filters.Add (new HitFilter (nhf.HitFilter));
					}

					break;
				}
			}

			// If we have no required parts, give up.
			if (primary_required_part_queries == null)
				return;

			//
			// Now that we have all of these nice queries, let's execute them!
			//

			// Create the searchers that we will need.

			IndexReader primary_reader;
			LNS.IndexSearcher primary_searcher;
			IndexReader secondary_reader = null;
			LNS.IndexSearcher secondary_searcher = null;

			primary_reader = LuceneCommon.GetReader (PrimaryStore);
			primary_searcher = new LNS.IndexSearcher (primary_reader);

			if (SecondaryStore != null) {
				secondary_reader = LuceneCommon.GetReader (SecondaryStore);
				if (secondary_reader.NumDocs () == 0) {
					ReleaseReader (secondary_reader);
					secondary_reader = null;
				}
			}

			if (secondary_reader != null)
				secondary_searcher = new LNS.IndexSearcher (secondary_reader);

			// Possibly create our whitelists from the search subset.

			LuceneBitArray primary_whitelist = null;
			LuceneBitArray secondary_whitelist = null;

			if (search_subset_uris != null && search_subset_uris.Count > 0) {
				primary_whitelist = new LuceneBitArray (primary_searcher);
				if (secondary_searcher != null)
					secondary_whitelist = new LuceneBitArray (secondary_searcher);

				foreach (Uri uri in search_subset_uris) {
					primary_whitelist.AddUri (uri);
					if (secondary_whitelist != null)
						secondary_whitelist.AddUri (uri);
				}
				primary_whitelist.FlushUris ();
				if (secondary_whitelist != null)
					secondary_whitelist.FlushUris ();
			}

			// Build blacklists from our prohibited parts.
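			// A document is excluded if it matches any prohibited
			// part in either index; joining the secondary blacklist
			// onto the primary one (below) propagates prohibitions
			// across the URI mapping between the two indexes.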
			LuceneBitArray primary_blacklist = null;
			LuceneBitArray secondary_blacklist = null;

			if (primary_prohibited_part_query != null) {
				primary_blacklist = new LuceneBitArray (primary_searcher,
									primary_prohibited_part_query);

				if (secondary_searcher != null) {
					secondary_blacklist = new LuceneBitArray (secondary_searcher);
					if (secondary_prohibited_part_query != null)
						secondary_blacklist.Or (secondary_prohibited_part_query);
					primary_blacklist.Join (secondary_blacklist);
				}
			}

			// Combine our whitelist and blacklist into just a whitelist.

			if (primary_blacklist != null) {
				if (primary_whitelist == null) {
					primary_blacklist.Not ();
					primary_whitelist = primary_blacklist;
				} else {
					primary_whitelist.AndNot (primary_blacklist);
				}
			}

			if (secondary_blacklist != null) {
				if (secondary_whitelist == null) {
					secondary_blacklist.Not ();
					secondary_whitelist = secondary_blacklist;
				} else {
					secondary_whitelist.AndNot (secondary_blacklist);
				}
			}

			BetterBitArray primary_matches = null;

			if (primary_required_part_queries != null) {

				if (secondary_searcher != null)
					primary_matches = DoRequiredQueries_TwoIndex (primary_searcher,
										      secondary_searcher,
										      primary_required_part_queries,
										      secondary_required_part_queries,
										      primary_whitelist,
										      secondary_whitelist);
				else
					primary_matches = DoRequiredQueries (primary_searcher,
									     primary_required_part_queries,
									     primary_whitelist);
			}

			sw.Stop ();
			if (Debug)
				Logger.Log.Debug ("###### Finished low-level queries in {0}", sw);
			sw.Reset ();
			sw.Start ();

			// Only generate results if we got some matches
			if (primary_matches != null && primary_matches.ContainsTrue ()) {
				GenerateQueryResults (primary_reader,
						      primary_searcher,
						      secondary_searcher,
						      primary_matches,
						      result,
						      term_list,
						      query.MaxHits,
						      uri_filter,
						      new HitFilter (all_hit_filters.HitFilter));
			}

			//
			// Finally, we clean up after ourselves.
			//

			primary_searcher.Close ();
			if (secondary_searcher != null)
				secondary_searcher.Close ();
			ReleaseReader (primary_reader);
			if (secondary_reader != null)
				ReleaseReader (secondary_reader);

			sw.Stop ();
			if (Debug)
				Logger.Log.Debug ("###### Processed query in {0}", sw);
		}

		////////////////////////////////////////////////////////////////

		//
		// Special logic for handling our set of required queries
		//

		// This is the easy case: we just combine all of the queries
		// into one big BooleanQuery.
		private static BetterBitArray DoRequiredQueries (LNS.IndexSearcher primary_searcher,
								 ArrayList primary_queries,
								 BetterBitArray primary_whitelist)
		{
			LNS.BooleanQuery combined_query;
			combined_query = new LNS.BooleanQuery ();
			foreach (LNS.Query query in primary_queries)
				combined_query.Add (query, true, false);

			LuceneBitArray matches;
			matches = new LuceneBitArray (primary_searcher, combined_query);
			if (primary_whitelist != null)
				matches.And (primary_whitelist);

			return matches;
		}

		// This code attempts to execute N required queries in the
		// most efficient order to minimize the amount of time spent
		// joining between the two indexes.  It returns a joined bit
		// array of matches against the primary index.
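		//
		// The strategy: run every required query against both indexes,
		// then repeatedly pick the candidate set with the smallest
		// upper bound (primary + secondary match count), join it, and
		// use it to shrink all of the remaining sets.  For example, if
		// three required parts match roughly 10,000, 50 and 2,000
		// documents, the 50-document set is joined first and the other
		// two are immediately cut down to at most those 50 documents.
		// (The numbers are illustrative, not measurements.)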
		private class MatchInfo : IComparable {

			public LuceneBitArray PrimaryMatches = null;
			public LuceneBitArray SecondaryMatches = null;
			public int UpperBound = 0;

			public void Join ()
			{
				PrimaryMatches.Join (SecondaryMatches);
			}

			public void RestrictBy (MatchInfo joined)
			{
				if (joined != null) {
					this.PrimaryMatches.And (joined.PrimaryMatches);
					this.SecondaryMatches.And (joined.SecondaryMatches);
				}

				UpperBound = 0;
				UpperBound += PrimaryMatches.TrueCount;
				UpperBound += SecondaryMatches.TrueCount;
			}

			public int CompareTo (object obj)
			{
				MatchInfo other = (MatchInfo) obj;
				return this.UpperBound - other.UpperBound;
			}
		}

		// Any whitelists that are passed in must be fully joined, or
		// query results will be incorrect.
		private static BetterBitArray DoRequiredQueries_TwoIndex (LNS.IndexSearcher primary_searcher,
									  LNS.IndexSearcher secondary_searcher,
									  ArrayList primary_queries,
									  ArrayList secondary_queries,
									  BetterBitArray primary_whitelist,
									  BetterBitArray secondary_whitelist)
		{
			ArrayList match_info_list;
			match_info_list = new ArrayList ();

			// First, do all of the low-level queries
			// and store them in our MatchInfo
			for (int i = 0; i < primary_queries.Count; ++i) {
				LNS.Query pq, sq;
				pq = primary_queries [i] as LNS.Query;
				sq = secondary_queries [i] as LNS.Query;

				LuceneBitArray p_matches = null, s_matches = null;

				p_matches = new LuceneBitArray (primary_searcher);
				if (pq != null) {
					p_matches.Or (pq);
					if (primary_whitelist != null)
						p_matches.And (primary_whitelist);
				}

				s_matches = new LuceneBitArray (secondary_searcher);
				if (sq != null) {
					s_matches.Or (sq);
					if (secondary_whitelist != null)
						s_matches.And (secondary_whitelist);
				}

				MatchInfo info;
				info = new MatchInfo ();
				info.PrimaryMatches = p_matches;
				info.SecondaryMatches = s_matches;
				info.RestrictBy (null); // a hack to initialize the UpperBound
				match_info_list.Add (info);
			}

			// We want to be smart about the order we do this in,
			// to minimize the expense of the Join.
			while (match_info_list.Count > 1) {

				// linear scan to find the minimum
				int index_min = 0;
				for (int i = 1; i < match_info_list.Count; ++i)
					if (((MatchInfo) match_info_list [i]).CompareTo ((MatchInfo) match_info_list [index_min]) < 0)
						index_min = i;

				MatchInfo smallest;
				smallest = match_info_list [index_min] as MatchInfo;
				match_info_list.RemoveAt (index_min);

				// We can short-circuit if our smallest set of
				// matches is empty.
				if (smallest.UpperBound == 0)
					return smallest.PrimaryMatches; // this must be an empty array.
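				// Fully join the smallest set across both indexes,
				// then use it to shrink every remaining set.  Each
				// RestrictBy call also recomputes that set's
				// UpperBound, so the next pass of the loop picks the
				// new smallest set.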
				smallest.Join ();

				foreach (MatchInfo info in match_info_list)
					info.RestrictBy (smallest);
			}

			// For the final pair, we don't need to do a full join:
			// mapping the secondary onto the primary is sufficient
			MatchInfo last;
			last = match_info_list [0] as MatchInfo;
			last.SecondaryMatches.ProjectOnto (last.PrimaryMatches);

			return last.PrimaryMatches;
		}

		////////////////////////////////////////////////////////////////

		static private void ScoreHits (Hashtable   hits_by_id,
					       IndexReader reader,
					       ICollection term_list)
		{
			Stopwatch sw;
			sw = new Stopwatch ();
			sw.Start ();

			LNS.Similarity similarity;
			similarity = LNS.Similarity.GetDefault ();

			foreach (Term term in term_list) {

				double idf;
				idf = similarity.Idf (reader.DocFreq (term), reader.MaxDoc ());

				int hit_count;
				hit_count = hits_by_id.Count;

				TermDocs term_docs;
				term_docs = reader.TermDocs (term);
				while (term_docs.Next () && hit_count > 0) {

					int id;
					id = term_docs.Doc ();

					Hit hit;
					hit = hits_by_id [id] as Hit;
					if (hit != null) {
						double tf;
						tf = similarity.Tf (term_docs.Freq ());
						hit.Score += tf * idf;
						--hit_count;
					}
				}
			}

			sw.Stop ();
		}

		////////////////////////////////////////////////////////////////

		private class DocAndId {
			public Document Doc;
			public int Id;
		}

		//
		// Given a set of hits, broadcast some set out as our query
		// results.
		//

		private static void GenerateQueryResults (IndexReader       primary_reader,
							  LNS.IndexSearcher primary_searcher,
							  LNS.IndexSearcher secondary_searcher,
							  BetterBitArray    primary_matches,
							  IQueryResult      result,
							  ICollection       query_term_list,
							  int               max_results,
							  UriFilter         uri_filter,
							  HitFilter         hit_filter)
		{
			TopScores top_docs = null;
			ArrayList all_docs = null;

			if (Debug)
				Logger.Log.Debug (">>> Initially handed {0} matches", primary_matches.TrueCount);

			if (primary_matches.TrueCount <= max_results) {
				if (Debug)
					Logger.Log.Debug (">>> Initial count is within our limit of {0}", max_results);
				all_docs = new ArrayList ();
			} else {
				if (Debug)
					Logger.Log.Debug (">>> Number of hits is capped at {0}", max_results);
				top_docs = new TopScores (max_results);
			}

			Stopwatch total, a, b, c;
			total = new Stopwatch ();
			a = new Stopwatch ();
			b = new Stopwatch ();
			c = new Stopwatch ();

			total.Start ();
			a.Start ();

			// Pull in the primary documents.
			// We walk across them backwards, since newer
			// documents are more likely to be at the end of
			// the index.
			int j = primary_matches.Count;
			while (true) {
				int i;
				i = primary_matches.GetPreviousTrueIndex (j);
				if (i < 0)
					break;
				j = i-1; // This way we can't forget to adjust i

				Document doc;
				doc = primary_searcher.Doc (i);

				// Check the timestamp --- if we have already reached our
				// limit, we might be able to reject it immediately.
				string timestamp_str;
				long timestamp_num = 0;

				timestamp_str = doc.Get ("Timestamp");
				if (timestamp_str == null) {
					Logger.Log.Warn ("No timestamp on {0}!", GetUriFromDocument (doc));
				} else {
					timestamp_num = Int64.Parse (doc.Get ("Timestamp"));
					if (top_docs != null && ! top_docs.WillAccept (timestamp_num))
						continue;
				}

				// If we have a UriFilter, apply it.
				if (uri_filter != null) {
					Uri uri;
					uri = GetUriFromDocument (doc);
					if (! uri_filter (uri))
						continue;
				}

				DocAndId doc_and_id = new DocAndId ();
				doc_and_id.Doc = doc;
				doc_and_id.Id = i;

				// Add the document to the appropriate data structure.
				// We use the timestamp_num as the score, so high
				// scores correspond to more-recent timestamps.
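				// (If top_docs is in use, documents too old to make
				// the cut were already skipped by the WillAccept
				// check above, so we never build DocAndId objects
				// for them.)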
				if (all_docs != null)
					all_docs.Add (doc_and_id);
				else
					top_docs.Add (timestamp_num, doc_and_id);
			}

			a.Stop ();
			b.Start ();

			ICollection final_list_of_docs;
			if (all_docs != null)
				final_list_of_docs = all_docs;
			else
				final_list_of_docs = top_docs.TopScoringObjects;

			ArrayList final_list_of_hits;
			final_list_of_hits = new ArrayList (final_list_of_docs.Count);

			// This is used only for scoring
			Hashtable hits_by_id = null;
			hits_by_id = new Hashtable ();

			// If we aren't using the secondary index, the next step is
			// very straightforward.
			if (secondary_searcher == null) {

				foreach (DocAndId doc_and_id in final_list_of_docs) {
					Hit hit;
					hit = DocumentToHit (doc_and_id.Doc);
					hits_by_id [doc_and_id.Id] = hit;
					final_list_of_hits.Add (hit);
				}

			} else {

				if (Debug)
					Logger.Log.Debug (">>> Performing cross-index Hit reunification");

				Hashtable hits_by_uri;
				hits_by_uri = UriFu.NewHashtable ();

				LuceneBitArray secondary_matches;
				secondary_matches = new LuceneBitArray (secondary_searcher);

				foreach (DocAndId doc_and_id in final_list_of_docs) {
					Hit hit;
					hit = DocumentToHit (doc_and_id.Doc);
					hits_by_id [doc_and_id.Id] = hit;
					hits_by_uri [hit.Uri] = hit;
					secondary_matches.AddUri (hit.Uri);
				}

				secondary_matches.FlushUris ();

				// Attach all of our secondary properties
				// to the hits
				j = 0;
				while (true) {
					int i;
					i = secondary_matches.GetNextTrueIndex (j);
					if (i >= secondary_matches.Count)
						break;
					j = i+1;

					Document secondary_doc;
					secondary_doc = secondary_searcher.Doc (i);

					Uri uri;
					uri = GetUriFromDocument (secondary_doc);

					Hit hit;
					hit = hits_by_uri [uri] as Hit;

					AddPropertiesToHit (hit, secondary_doc, false);

					final_list_of_hits.Add (hit);
				}
			}

			ScoreHits (hits_by_id, primary_reader, query_term_list);

			b.Stop ();

			// If we used the TopScores object, we got our original
			// list of documents sorted for us.  If not, sort the
			// final list.
			if (top_docs == null)
				final_list_of_hits.Sort ();

			c.Start ();

			// If we have a hit_filter, use it now.
			if (hit_filter != null) {
				for (int i = 0; i < final_list_of_hits.Count; ++i) {
					Hit hit;
					hit = final_list_of_hits [i] as Hit;
					if (! hit_filter (hit)) {
						if (Debug)
							Logger.Log.Debug ("Filtered out {0}", hit.Uri);
						final_list_of_hits [i] = null;
					}
				}
			}

			// Before we broadcast a hit, we strip out any
			// properties in the PrivateNamespace.  We
			// manipulate the property ArrayList directly,
			// which is pretty gross... but this is safe,
			// since removing items will not change the sort
			// order.
			foreach (Hit hit in final_list_of_hits) {
				if (hit == null)
					continue;
				int i = 0;
				while (i < hit.Properties.Count) {
					Property prop = hit.Properties [i] as Property;
					if (prop.Key.StartsWith (PrivateNamespace))
						hit.Properties.RemoveAt (i);
					else
						++i;
				}
			}

			result.Add (final_list_of_hits);

			c.Stop ();
			total.Stop ();

			if (Debug) {
				Logger.Log.Debug (">>> GenerateQueryResults time statistics:");
				Logger.Log.Debug (">>> First pass {0} ({1:0.0}%)", a, 100 * a.ElapsedTime / total.ElapsedTime);
				Logger.Log.Debug (">>> Hit assembly {0} ({1:0.0}%)", b, 100 * b.ElapsedTime / total.ElapsedTime);
				Logger.Log.Debug (">>> Final pass {0} ({1:0.0}%)", c, 100 * c.ElapsedTime / total.ElapsedTime);
				Logger.Log.Debug (">>> TOTAL {0}", total);
			}
		}
	}
}