OpenTREP Logo  0.07.10
C++ Open Travel Request Parsing Library
IndexBuilder.cpp
Go to the documentation of this file.
1 // //////////////////////////////////////////////////////////////////////
2 // Import section
3 // //////////////////////////////////////////////////////////////////////
4 // STL
5 #include <cassert>
6 #include <string>
7 #include <vector>
8 #include <exception>
9 // Boost
10 #include <boost/filesystem.hpp>
11 #include <boost/filesystem/fstream.hpp>
12 #include <boost/tokenizer.hpp>
13 #include <boost/iostreams/device/file.hpp>
14 #include <boost/iostreams/filtering_stream.hpp>
15 #include <boost/iostreams/filter/gzip.hpp>
16 #include <boost/iostreams/filter/bzip2.hpp>
17 // SOCI
18 #include <soci/soci.h>
19 // Xapian
20 #include <xapian.h>
21 // OpenTrep
26 #include <opentrep/bom/World.hpp>
27 #include <opentrep/bom/Place.hpp>
36 
37 namespace OPENTREP {
38 
39  // //////////////////////////////////////////////////////////////////////
40  void addToXapian (const Place& iPlace, Xapian::Document& ioDocument,
41  Xapian::WritableDatabase& ioDatabase) {
48  Xapian::TermGenerator lTermGenerator;
49  lTermGenerator.set_database (ioDatabase);
50  lTermGenerator.set_document (ioDocument);
51 
52  // DEBUG
53  // OPENTREP_LOG_DEBUG ("Indexing for " << iPlace.describeKey());
54 
55  const Place::TermSetMap_T& lTermSetMap = iPlace.getTermSetMap();
56  for (Place::TermSetMap_T::const_iterator itStringSet = lTermSetMap.begin();
57  itStringSet != lTermSetMap.end(); ++itStringSet) {
58  // Retrieve the weight
59  const Weight_T& lWeight = itStringSet->first;
60  const Xapian::termcount lWDFInc =
61  static_cast<const Xapian::termcount> (lWeight);
62 
63  // Retrieve the set of strings for that weight
64  const Place::StringSet_T& lTermSet = itStringSet->second;
65  for (Place::StringSet_T::const_iterator itString = lTermSet.begin();
66  itString != lTermSet.end(); ++itString) {
67  const std::string& lString = *itString;
68  lTermGenerator.index_text (lString, lWDFInc);
69 
70  // DEBUG
71  //OPENTREP_LOG_DEBUG("[" << lWeight << "/" << lWDFInc << "] "<< lString);
72  }
73  }
74 
75  // Spelling terms
76  const Place::StringSet_T& lSpellingSet = iPlace.getSpellingSet();
77  for (Place::StringSet_T::const_iterator itTerm = lSpellingSet.begin();
78  itTerm != lSpellingSet.end(); ++itTerm) {
79  const std::string& lTerm = *itTerm;
80  ioDatabase.add_spelling (lTerm);
81  }
82 
83  // DEBUG
84  OPENTREP_LOG_DEBUG ("Added terms for '" << iPlace.describeKey()
85  << "': " << iPlace.describeSets()
86  << " into " << ioDocument.get_description());
87  }
88 
89  // //////////////////////////////////////////////////////////////////////
90  void IndexBuilder::addDocumentToIndex(Xapian::WritableDatabase& ioDatabase,
91  Place& ioPlace,
92  const OTransliterator& iTransliterator) {
93 
94  // Create an empty Xapian document
95  Xapian::Document lDocument;
96 
97  // Retrieve the raw data string, to be stored as is within
98  // the Xapian document
99  const RawDataString_T& lRawDataString = ioPlace.getRawDataString();
100 
101  // The Xapian document data is indeed the same as the one of the
102  // OPTD-maintained list of POR (points of reference), allowing the search
103  // process to use exactly the same parser as the indexation process
104  lDocument.set_data (lRawDataString);
105 
106  // Build the (STL) sets of terms to be added to the Xapian index and
107  // spelling dictionary
108  ioPlace.buildIndexSets (iTransliterator);
109 
110  // Add the (STL) sets of terms to the Xapian index and spelling dictionary
111  addToXapian (ioPlace, lDocument, ioDatabase);
112 
113  // Add the document to the database
114  const Xapian::docid& lDocID = ioDatabase.add_document (lDocument);
115 
116  // Assign back the newly generated Xapian document ID to the
117  // Place object
118  ioPlace.setDocID (lDocID);
119  }
120 
121  // //////////////////////////////////////////////////////////////////////
122  NbOfDBEntries_T IndexBuilder::
123  buildSearchIndex (Xapian::WritableDatabase* ioXapianDB_ptr,
124  const DBType& iSQLDBType, soci::session* ioSociSessionPtr,
125  std::istream& iPORFileStream,
126  const shouldIndexNonIATAPOR_T& iIncludeNonIATAPOR,
127  const OTransliterator& iTransliterator) {
128  NbOfDBEntries_T oNbOfEntries = 0;
129  NbOfDBEntries_T oNbOfEntriesInPORFile = 0;
130 
131  // Open the file to be parsed
132  Place& lPlace = FacPlace::instance().create();
133  std::string itReadLine;
134  while (std::getline (iPORFileStream, itReadLine)) {
135 
136  /* First, if only the IATA-refernced POR must be indexed
137  * (ie, when iIncludeNonIATAPOR is set to false), the line
138  * must start with a non empty IATA code of three letters;
139  * in other words, the separator (the hat symbol) is first seen
140  * at position 3 (remember that strings in C++ start at position 0).
141  * Otherwise, the line is skipped.
142  */
143  if (!iIncludeNonIATAPOR) {
144  const unsigned short lFirstSeparatorPos = itReadLine.find_first_of ("^");
145  if (lFirstSeparatorPos != 3) {
146  // DEBUG
147  /*
148  OPENTREP_LOG_ERROR ("[" << oNbOfEntries << "] pos of sep: "
149  << lFirstSeparatorPos << ", full line: "
150  << itReadLine);
151  */
152 
153  //
154  ++oNbOfEntriesInPORFile;
155 
156  //
157  continue;
158  }
159  }
160 
161  // Initialise the parser
162  PORStringParser lStringParser (itReadLine);
163 
164  // Parse the string
165  const Location& lLocation = lStringParser.generateLocation();
166 
167  // DEBUG
168  /*
169  const LocationKey& lLocationKey = lLocation.getKey();
170  OPENTREP_LOG_DEBUG ("[BEF-ADD] " << lLocationKey);
171  */
172 
173  /* When the line/string is relevant, create a BOM instance from
174  * the Location structure.
175  * Otherwise, the line is skipped.
176  */
177  const std::string& lCommonName = lLocation.getCommonName();
178  if (lCommonName == "NotAvailable") {
179  continue;
180  }
181 
182  // Fill the Place object with the Location structure.
183  lPlace.setLocation (lLocation);
184 
185  // Add the document, associated to the Place object, to the Xapian index,
186  // if required
187  if (ioXapianDB_ptr != NULL) {
188  IndexBuilder::addDocumentToIndex (*ioXapianDB_ptr, lPlace,
189  iTransliterator);
190  }
191 
192  // Add the document to the SQL database, if required
193  if (ioSociSessionPtr != NULL) {
194  DBManager::insertPlaceInDB (*ioSociSessionPtr, lPlace);
195  }
196 
197  // DEBUG
198  /*
199  OPENTREP_LOG_DEBUG ("[AFT-ADD] " << lLocationKey
200  << ", Place: " << lPlace);
201  */
202 
203  // Iteration
204  ++oNbOfEntries; ++oNbOfEntriesInPORFile;
205 
206  // Progress status
207  if (oNbOfEntries % 1000 == 0) {
208  std::cout.imbue( std::locale (std::locale::classic(), new NumSep));
209  std::cout << "Number of actually parsed records: " << oNbOfEntries
210  << ", out of " << oNbOfEntriesInPORFile
211  << " records in the POR data file so far" << std::endl;
212  }
213 
214  // DEBUG
215  OPENTREP_LOG_DEBUG ("[" << oNbOfEntries << "] " << lPlace);
216 
217  // Reset for next turn
218  lPlace.resetMatrix();
219  lPlace.resetIndexSets();
220  }
221 
222  return oNbOfEntries;
223  }
224 
225  // //////////////////////////////////////////////////////////////////////
226  NbOfDBEntries_T IndexBuilder::
227  buildSearchIndex (const PORFilePath_T& iPORFilePath,
228  const TravelDBFilePath_T& iTravelIndexFilePath,
229  const DBType& iSQLDBType,
230  const SQLDBConnectionString_T& iSQLDBConnStr,
231  const shouldIndexNonIATAPOR_T& iIncludeNonIATAPOR,
232  const shouldIndexPORInXapian_T& iShouldIndexPORInXapian,
233  const shouldAddPORInSQLDB_T& iShouldAddPORInSQLDB,
234  const OTransliterator& iTransliterator) {
235  NbOfDBEntries_T oNbOfEntries = 0;
236  soci::session* lSociSession_ptr = NULL;
237  Xapian::WritableDatabase* lXapianDatabase_ptr = NULL;
238 
247  if (iShouldIndexPORInXapian) {
248  // Delete and recreate the directory, and its full content,
249  // hosting the Xapian index / database
250  FileManager::recreateXapianDirectory (iTravelIndexFilePath);
251 
252  // Recreate the Xapian index / database
253  lXapianDatabase_ptr =
254  FacXapianDB::instance().create (iTravelIndexFilePath, Xapian::DB_CREATE);
255  assert (lXapianDatabase_ptr != NULL);
256 
257  // DEBUG
258  OPENTREP_LOG_DEBUG ("The Xapian index / database ('"
259  << iTravelIndexFilePath
260  << "') has been re-created, checked and opened");
261 
262 
271  lXapianDatabase_ptr->begin_transaction();
272 
273  // DEBUG
274  OPENTREP_LOG_DEBUG ("A transaction has begun on the Xapian database ('"
275  << iTravelIndexFilePath << "')");
276  }
277 
283  if (iShouldAddPORInSQLDB) {
284  /*
285  // Creation of the trep user and trep_trep database
286  bool isSuccessful = DBManager::createSQLDBUser (iSQLDBType, iSQLDBConnStr);
287  if (isSuccessful == false) {
288  std::ostringstream errorStr;
289  errorStr << "Error when trying to re-initialize the SQL database ('"
290  << iSQLDBConnStr << "')";
291  OPENTREP_LOG_ERROR (errorStr.str());
292  throw SQLDatabaseImpossibleConnectionException (errorStr.str());
293  }
294  */
295 
299  if (!(iSQLDBType == DBType::NODB)) {
300  // Connection to the database
301  lSociSession_ptr =
302  DBManager::initSQLDBSession (iSQLDBType, iSQLDBConnStr);
303 
304  if (lSociSession_ptr == NULL) {
305  std::ostringstream errorStr;
306  errorStr << "Error when trying to connect to the SQL database ('"
307  << iSQLDBConnStr << "')";
308  OPENTREP_LOG_ERROR (errorStr.str());
309  throw SQLDatabaseImpossibleConnectionException (errorStr.str());
310  }
311  assert (lSociSession_ptr != NULL);
312 
313  // Creation of the POR table
314  DBManager::createSQLDBTables (*lSociSession_ptr);
315  }
316  }
317 
321  // DEBUG
322  OPENTREP_LOG_DEBUG ("Parsing POR input file: " << iPORFilePath);
323 
324  // Get a reference on the file stream corresponding to the POR file.
325  const PORFileHelper lPORFileHelper (iPORFilePath);
326  std::istream& lPORFileStream = lPORFileHelper.getFileStreamRef();
327 
328  // Browse the input POR (point of reference) data file,
329  // parse every of its rows, and put the result in the Xapian database/index
330  // and, if needed, within the SQL database.
331  oNbOfEntries = buildSearchIndex (lXapianDatabase_ptr, iSQLDBType,
332  lSociSession_ptr, lPORFileStream,
333  iIncludeNonIATAPOR, iTransliterator);
334 
339  if (iShouldIndexPORInXapian) {
340  assert (lXapianDatabase_ptr != NULL);
341  lXapianDatabase_ptr->commit_transaction();
342 
343  // DEBUG
344  OPENTREP_LOG_DEBUG ("Xapian has indexed " << oNbOfEntries << " entries.");
345  }
346 
354  if (iShouldIndexPORInXapian) {
355  assert (lXapianDatabase_ptr != NULL);
356  lXapianDatabase_ptr->close();
357  }
358 
359 
360  if (iShouldAddPORInSQLDB) {
364  if (!(iSQLDBType == DBType::NODB)) {
365  assert (lSociSession_ptr != NULL);
366  DBManager::createSQLDBIndexes (*lSociSession_ptr);
367  }
368 
372  if (!(iSQLDBType == DBType::NODB)) {
373  assert (lSociSession_ptr != NULL);
374  DBManager::terminateSQLDBSession (iSQLDBType, iSQLDBConnStr,
375  *lSociSession_ptr);
376  }
377  }
378 
379  return oNbOfEntries;
380  }
381 
382 }
#define OPENTREP_LOG_ERROR(iToBeLogged)
Definition: Logger.hpp:24
#define OPENTREP_LOG_DEBUG(iToBeLogged)
Definition: Logger.hpp:33
static void terminateSQLDBSession(const DBType &, const SQLDBConnectionString_T &, soci::session &)
Definition: DBManager.cpp:406
static void createSQLDBTables(soci::session &)
Definition: DBManager.cpp:460
static soci::session * initSQLDBSession(const DBType &, const SQLDBConnectionString_T &)
Definition: DBManager.cpp:318
static void createSQLDBIndexes(soci::session &)
Definition: DBManager.cpp:595
static void insertPlaceInDB(soci::session &, const Place &)
Definition: DBManager.cpp:954
Place & create()
Definition: FacPlace.cpp:41
static FacPlace & instance()
Definition: FacPlace.cpp:29
static FacXapianDB & instance()
Definition: FacXapianDB.cpp:39
Xapian::WritableDatabase * create(const TravelDBFilePath_T &, const int &iXapianActionFlag)
Definition: FacXapianDB.cpp:50
static void recreateXapianDirectory(const std::string &iTravelDBFilePath)
Definition: FileManager.cpp:52
Class modelling a place/POR (point of reference).
Definition: Place.hpp:29
const TermSetMap_T & getTermSetMap() const
Definition: Place.hpp:495
std::map< const Weight_T, StringSet_T > TermSetMap_T
Definition: Place.hpp:41
std::string describeSets() const
Definition: Place.cpp:157
const StringSet_T & getSpellingSet() const
Definition: Place.hpp:509
std::set< std::string > StringSet_T
Definition: Place.hpp:40
std::string describeKey() const
Definition: Place.hpp:1053
void addToXapian(const Place &iPlace, Xapian::Document &ioDocument, Xapian::WritableDatabase &ioDatabase)
unsigned short Weight_T
bool shouldAddPORInSQLDB_T
unsigned int NbOfDBEntries_T
bool shouldIndexPORInXapian_T
bool shouldIndexNonIATAPOR_T