c# - Try to answer some boolean queries using Term-Document-Incidence-Matrix -
i try answer simple boolean query in these ways not x not y not z
x , y , z
, x or y or z
x,y,z
words , of them belongs different file.txt
or maybe of them belongs same file.txt
, no matter .
i've written class termdocmatrix
:
it must able answer boolean query , prepared methods in class termdocmatrix{ }
for these purposes doesn't work. debug code step step, realized loops have no turn. don't know why, codes seems fine.
well can see code in here :
class termdocmatrix { //stores distinct terms public hashset<string> distinctterm = new hashset<string>(); //stores document id , contents without splitting public dictionary<int, string> documentcontentlist = new dictionary<int, string>(); //stores document , terms collection public dictionary<string, list<string>> documentcollection = new dictionary<string, list<string>>(); public dictionary<string, list<int>> termdocumentincidencematrix = new dictionary<string, list<int>>(); //stop words collection public list<string> stopwords = new list<string> { "on", "of", "the", "an", "a", "in" }; //boolean operators list public string[] booleanoperator = new string[] { "and", "or", "not" }; private string _filename = "words"; public string _path = ""; int _lastdocnum = 0; public termdocmatrix(string indexpath,string filename) { if (_path.endswith("\\") == false) _path += "\\"; if (!directory.exists(indexpath)) directory.createdirectory(indexpath); logmanager.configure(_path + _filename + ".txt", false); // read files loadfiles(); } private void loadfiles() { int count = 0; if (file.exists(_path + _filename + ".txt") == false) return; // load words string b = file.readalltext(_path + _filename + ".txt"); string[] termscollection = removestopswords(b.toupper().split(' ')); foreach (string term in termscollection) { //prepeare distinct terms collection //remove stop words if (!stopwords.contains(term)) { distinctterm.add(term); } } //add document , terms collection documentcollection.add(_filename, termscollection.tolist()); //add document , content displaying search result documentcontentlist.add(count, b); count++; } public string processfiles(string query) { termdocumentincidencematrix = gettermdocumentincidencematrix(distinctterm, documentcollection); { list<int> lst = processquery(query); int count = 0; if (lst != null) { foreach (int in lst) { if (a == 1) { return documentcontentlist[count]; } count++; } } else { return "no search result found"; } } while (1 == 1); } public int wordcount() { return documentcollection.count; } public int documentcount { { return _lastdocnum; } } private void filterqueryterm(ref string[] str) { list<string> _queryterm = new list<string>(); foreach (string queryterm in str) { if (queryterm.toupper().equals("but") || termdocumentincidencematrix.containskey(queryterm.toupper()) || booleanoperator.contains(queryterm)) { _queryterm.add(queryterm); } } str = _queryterm.toarray(); } //prepares term document incidence matrix public dictionary<string, list<int>> gettermdocumentincidencematrix(hashset<string> distinctterms, dictionary<string, list<string>> documentcollection) { dictionary<string, list<int>> termdocumentincidencematrix = new dictionary<string, list<int>>(); list<int> incidencevector = new list<int>(); foreach (string term in distinctterms) { //incidence vector each terms incidencevector = new list<int>(); foreach (keyvaluepair<string, list<string>> p in documentcollection) { if (p.value.contains(term)) { //document contains term incidencevector.add(1); } else { //document not contains term incidencevector.add(0); } } termdocumentincidencematrix.add(term, incidencevector); } return termdocumentincidencematrix; } //removes stop words public string[] removestopswords(string[] str) { list<string> terms = new list<string>(); foreach (string term in str) { if (!stopwords.contains(term)) { terms.add(term); } } return terms.toarray(); } //process boolean query public list<int> processquery(string query) { //query boolean operator string bitwiseop = string.empty; string[] queryterm = removestopswords(query.toupper().split(' ')); //remove query term doesnot appears on document collection filterqueryterm(ref queryterm); list<int> previoustermincidencev = null; list<int> nexttermsincidencev = null; //holds bitwise operation result list<int> resultset = null; //suppose on query x , y, x previousterm term , y nextterm boolean haspreviousterm = false; boolean hasnotoperation = false; foreach (string term in queryterm) { //is term if (!booleanoperator.contains(term) && !term.equals("but")) { //query case: structure , not analysis if (hasnotoperation) { if (haspreviousterm) { nexttermsincidencev = processbooleanoperator("not", gettermincidencevector(term), nexttermsincidencev); } //query case: eg.not analysis else { previoustermincidencev = processbooleanoperator("not", gettermincidencevector(term), nexttermsincidencev); resultset = previoustermincidencev; } hasnotoperation = false; } else if (!haspreviousterm) { previoustermincidencev = gettermincidencevector(term); resultset = previoustermincidencev; haspreviousterm = true; } else { nexttermsincidencev = gettermincidencevector(term); } } else if (term.equals("not")) { //indicates term in next iteration should complemented. hasnotoperation = true; } else { //'but' should evaluated , eg. structure not semantic should evaluated structure , not semantic if (term.equals("but")) { bitwiseop = "and"; } else bitwiseop = term; } if (nexttermsincidencev != null && !hasnotoperation) { resultset = processbooleanoperator(bitwiseop, previoustermincidencev, nexttermsincidencev); previoustermincidencev = resultset; haspreviousterm = true; nexttermsincidencev = null; } } return resultset; } //process boolean operators public list<int> processbooleanoperator(string op, list<int> previoustermv, list<int> nexttermv) { list<int> resultset = new list<int>(); if (op.equals("not")) { foreach (int in previoustermv) { if (a == 1) { resultset.add(0); } else { resultset.add(1); } } } else if (op.toupper().equals("and")) //bitwise , operation { (int = 0; < previoustermv.count; a++) { if (previoustermv[a] == 1 && nexttermv[a] == 1) { resultset.add(1); } else { resultset.add(0); } } } else if (op.toupper().equals("or")) //bitwise or operation { (int = 0; < previoustermv.count; a++) { if (previoustermv[a] == 0 && nexttermv[a] == 0) { resultset.add(0); } else { resultset.add(1); } } } return resultset; } //returns term incidence vector public list<int> gettermincidencevector(string term) { return termdocumentincidencematrix[term.toupper()]; } }
you need class named class logmanager
used on class termdocmatrix
. here :
namespace windowsformsapplication1 { internal class filelogger { public static readonly filelogger instance = new filelogger(); private string _filename; private bool _showmethodname = false; private string _filepath = ""; public bool showmethodnames { { return _showmethodname; } } public void init(string filename, bool showmethodnames) { _showmethodname = showmethodnames; _filename = filename; // handle folder names -> create dir etc. _filepath = path.getdirectoryname(filename); if (_filepath != "") { _filepath = directory.createdirectory(_filepath).fullname; if (_filepath.endswith("\\") == false) _filepath += "\\"; } } } internal static class logmanager { public static void configure(string filename, bool showmethodnames) { filelogger.instance.init(filename, showmethodnames); } }
}
it must work don't. tell me why don't work, please. when ask answer see "no search result found", no matter i'd typed kind of boolean query.
your problem in line: (processfiles
function)
string[] termscollection = removestopswords(file.toupper().split(' '));
you're splitting name of file , not content that's why have no search results
you should instead:
string[] termscollection = removestopswords(file.readalltext(file).toupper().split(' '));
now change termdocmatrix
constructor:
public termdocmatrix(string indexpath,string filename) { if (!directory.exists(indexpath)) directory.createdirectory(indexpath); logmanager.configure(system.io.path.combine(_path, _filename + ".txt"), false); // read files loadfiles(); }
and loadfiles
function:
private void loadfiles() { int count = 0; if (file.exists(system.io.path.combine(_path, _filename + ".txt")) == false) return; // load words string b = file.readalltext(system.io.path.combine(_path, _filename + ".txt")); ..... }
Comments
Post a Comment