c# - Try to answer some boolean queries using Term-Document-Incidence-Matrix -


i try answer simple boolean query in these ways not x not y not z x , y , z , x or y or z x,y,z words , of them belongs different file.txtor maybe of them belongs same file.txt, no matter .

i've written class termdocmatrix:

it must able answer boolean query , prepared methods in class termdocmatrix{ }for these purposes doesn't work. debug code step step, realized loops have no turn. don't know why, codes seems fine.

well can see code in here :

class termdocmatrix {     //stores distinct terms     public hashset<string> distinctterm = new hashset<string>();     //stores document id , contents without splitting     public dictionary<int, string> documentcontentlist = new dictionary<int, string>();     //stores document , terms collection     public dictionary<string, list<string>> documentcollection = new dictionary<string, list<string>>();     public dictionary<string, list<int>> termdocumentincidencematrix = new dictionary<string, list<int>>();       //stop words collection     public list<string> stopwords = new list<string> { "on", "of", "the", "an", "a", "in" };     //boolean operators list     public string[] booleanoperator = new string[] { "and", "or", "not" };      private string _filename = "words";     public string _path = "";     int _lastdocnum = 0;      public termdocmatrix(string indexpath,string filename)     {          if (_path.endswith("\\") == false) _path += "\\";         if (!directory.exists(indexpath)) directory.createdirectory(indexpath);         logmanager.configure(_path + _filename + ".txt", false);         // read files         loadfiles();     }      private void loadfiles()     {         int count = 0;          if (file.exists(_path + _filename + ".txt") == false)             return;         // load words         string b = file.readalltext(_path + _filename + ".txt");         string[] termscollection = removestopswords(b.toupper().split(' '));         foreach (string term in termscollection)         {             //prepeare distinct terms collection             //remove stop words             if (!stopwords.contains(term))             {                 distinctterm.add(term);             }         }         //add document , terms collection         documentcollection.add(_filename, termscollection.tolist());         //add document , content displaying search result         documentcontentlist.add(count, b);         count++;     }     public string processfiles(string query)     {         termdocumentincidencematrix = gettermdocumentincidencematrix(distinctterm, documentcollection);                 {             list<int> lst = processquery(query);             int count = 0;             if (lst != null)             {                 foreach (int in lst)                 {                     if (a == 1)                     {                          return documentcontentlist[count];                     }                     count++;                 }             }             else             {                 return "no search result found";             }          } while (1 == 1);     }     public int wordcount()     {         return documentcollection.count;     }      public int documentcount     {                 {             return _lastdocnum;         }     }      private void filterqueryterm(ref string[] str)     {         list<string> _queryterm = new list<string>();           foreach (string queryterm in str)         {             if (queryterm.toupper().equals("but") || termdocumentincidencematrix.containskey(queryterm.toupper()) || booleanoperator.contains(queryterm))             {                 _queryterm.add(queryterm);              }         }          str = _queryterm.toarray();     }      //prepares term document incidence matrix     public dictionary<string, list<int>> gettermdocumentincidencematrix(hashset<string> distinctterms, dictionary<string, list<string>> documentcollection)     {         dictionary<string, list<int>> termdocumentincidencematrix = new dictionary<string, list<int>>();         list<int> incidencevector = new list<int>();         foreach (string term in distinctterms)         {             //incidence vector each terms             incidencevector = new list<int>();             foreach (keyvaluepair<string, list<string>> p in documentcollection)             {                  if (p.value.contains(term))                 {                     //document contains term                     incidencevector.add(1);                  }                 else                 {                     //document not contains term                     incidencevector.add(0);                 }             }             termdocumentincidencematrix.add(term, incidencevector);          }         return termdocumentincidencematrix;     }     //removes stop words     public string[] removestopswords(string[] str)     {         list<string> terms = new list<string>();         foreach (string term in str)         {             if (!stopwords.contains(term))             {                 terms.add(term);             }         }         return terms.toarray();     }     //process boolean query     public list<int> processquery(string query)     {          //query boolean operator         string bitwiseop = string.empty;         string[] queryterm = removestopswords(query.toupper().split(' '));          //remove query term doesnot appears on document collection         filterqueryterm(ref queryterm);         list<int> previoustermincidencev = null;         list<int> nexttermsincidencev = null;         //holds bitwise operation result         list<int> resultset = null;         //suppose on query x , y, x previousterm term , y nextterm         boolean haspreviousterm = false;         boolean hasnotoperation = false;         foreach (string term in queryterm)         {             //is term             if (!booleanoperator.contains(term) && !term.equals("but"))             {                 //query case: structure , not analysis                 if (hasnotoperation)                 {                      if (haspreviousterm)                     {                         nexttermsincidencev = processbooleanoperator("not", gettermincidencevector(term), nexttermsincidencev);                     }                     //query case: eg.not analysis                     else                     {                         previoustermincidencev = processbooleanoperator("not", gettermincidencevector(term), nexttermsincidencev);                         resultset = previoustermincidencev;                     }                     hasnotoperation = false;                 }                 else if (!haspreviousterm)                 {                     previoustermincidencev = gettermincidencevector(term);                     resultset = previoustermincidencev;                     haspreviousterm = true;                 }                 else                 {                      nexttermsincidencev = gettermincidencevector(term);                 }             }             else if (term.equals("not"))             {                 //indicates  term in next iteration should complemented.                 hasnotoperation = true;             }             else             {                 //'but' should evaluated , eg. structure not semantic should evaluated structure , not semantic                 if (term.equals("but"))                 {                     bitwiseop = "and";                 }                 else                     bitwiseop = term;             }              if (nexttermsincidencev != null && !hasnotoperation)             {                 resultset = processbooleanoperator(bitwiseop, previoustermincidencev, nexttermsincidencev);                 previoustermincidencev = resultset;                 haspreviousterm = true;                 nexttermsincidencev = null;             }         }          return resultset;     }      //process boolean operators     public list<int> processbooleanoperator(string op, list<int> previoustermv, list<int> nexttermv)     {         list<int> resultset = new list<int>();         if (op.equals("not"))         {             foreach (int in previoustermv)             {                 if (a == 1)                 {                     resultset.add(0);                 }                 else                 {                     resultset.add(1);                 }             }         }         else if (op.toupper().equals("and")) //bitwise , operation         {             (int = 0; < previoustermv.count; a++)             {                 if (previoustermv[a] == 1 && nexttermv[a] == 1)                 {                     resultset.add(1);                 }                 else                 {                     resultset.add(0);                 }             }         }         else if (op.toupper().equals("or")) //bitwise or operation         {             (int = 0; < previoustermv.count; a++)             {                 if (previoustermv[a] == 0 && nexttermv[a] == 0)                 {                     resultset.add(0);                 }                 else                 {                     resultset.add(1);                 }             }         }         return resultset;     }      //returns term incidence vector     public list<int> gettermincidencevector(string term)     {         return termdocumentincidencematrix[term.toupper()];      } } 

you need class named class logmanager used on class termdocmatrix. here :

namespace windowsformsapplication1  {    internal class filelogger    {     public static readonly filelogger instance = new filelogger();      private string _filename;     private bool _showmethodname = false;     private string _filepath = "";      public bool showmethodnames     {         { return _showmethodname; }     }      public void init(string filename, bool showmethodnames)     {         _showmethodname = showmethodnames;         _filename = filename;         // handle folder names -> create dir etc.         _filepath = path.getdirectoryname(filename);         if (_filepath != "")         {             _filepath = directory.createdirectory(_filepath).fullname;             if (_filepath.endswith("\\") == false)                 _filepath += "\\";         }     } } internal static class logmanager {      public static void configure(string filename, bool showmethodnames)     {         filelogger.instance.init(filename, showmethodnames);     } } 

}

it must work don't. tell me why don't work, please. when ask answer see "no search result found", no matter i'd typed kind of boolean query.

your problem in line: (processfiles function)

string[] termscollection = removestopswords(file.toupper().split(' ')); 

you're splitting name of file , not content that's why have no search results

you should instead:

string[] termscollection = removestopswords(file.readalltext(file).toupper().split(' ')); 

now change termdocmatrix constructor:

public termdocmatrix(string indexpath,string filename) {     if (!directory.exists(indexpath)) directory.createdirectory(indexpath);     logmanager.configure(system.io.path.combine(_path, _filename + ".txt"), false);     // read files     loadfiles(); } 

and loadfiles function:

private void loadfiles() {     int count = 0;      if (file.exists(system.io.path.combine(_path, _filename + ".txt")) == false)         return;     // load words     string b = file.readalltext(system.io.path.combine(_path, _filename + ".txt"));      ..... } 

Comments

Popular posts from this blog

linux - xterm copying to CLIPBOARD using copy-selection causes automatic updating of CLIPBOARD upon mouse selection -

c++ - qgraphicsview horizontal scrolling always has a vertical delta -