Merge pull request #69 from FlameOfIgnis/dev-ata

Performance updates
ignis-sec · Jun 10, 2021 · ad6f41b
2 parents 361ccaa + 588b223
commit ad6f41b
Show file tree

Hide file tree

Showing 8 changed files with 201 additions and 153 deletions.
diff --git a/Makefile b/Makefile
@@ -25,7 +25,7 @@ PYTHON_VERSION3 :=$(shell python$(PYTHON_VERSION) -c "import sys;t='{v[0]}.{v[1]
 #####################################     MarkovPassword project options     #################################
 ##############################################################################################################
 
-MP_C_FLAGS  := -Wall -Wextra -g
+MP_C_FLAGS  := -Wall -Wextra -g -Ofast
 MP_EXEC     := Markov
 MP_SRC      := $(shell find ./MarkovPasswords/src/ -name '*.cpp') 
 MP_INC 		:= 
@@ -44,7 +44,7 @@ MM_SRC          := $(shell find $(MM_SRC_DIR) -name '*.cpp')
 MM_OBJS         := $(MM_SRC:%=$(BIN)/%.o)
 MM_DEPS         := $(MM_OBJS:.o=.d)
 MM_LDFLAGS      := -shared 
-MM_C_FLAGS      := $(MM_INC_FLAGS) -MMD -MP
+MM_C_FLAGS      := $(MM_INC_FLAGS) -MMD -MP  -Ofast
 MM_INC_DIRS     := $(shell find $(MM_SRC_DIR) -type d)
 MM_INC_FLAGS    := $(addprefix -I,$(MM_INC_DIRS))
 MM_LIB          := model.so
@@ -69,7 +69,7 @@ MPY_SRC_DIR		 := Markopy/src/
 MPY_OBJS         := $(MPY_SRC:%=$(BIN)/%.o)
 MPY_DEPS         := $(MPY_OBJS:.o=.d)
 MPY_LDFLAGS      := -shared -lboost_python$(PYTHON_VERSION_) -lpython$(PYTHON_VERSION) -lpthread
-MPY_C_FLAGS      := $(MPY_INC_FLAGS) -MMD -MP -fPIC -I/usr/include/python$(PYTHON_VERSION)
+MPY_C_FLAGS      := $(MPY_INC_FLAGS) -MMD -MP -fPIC -I/usr/include/python$(PYTHON_VERSION)  -Ofast
 MPY_INC_DIRS     := $(shell find $(MPY_SRC_DIR) -type d) $(shell pwd)
 MPY_INC_FLAGS    := $(addprefix -I,$(MPY_INC_DIRS))
 MPY_SO           := markopy.so

diff --git a/Markopy/src/CLI/markopy_cli.py b/Markopy/src/CLI/markopy_cli.py
@@ -6,16 +6,16 @@
 
 parser = argparse.ArgumentParser(description="Python wrapper for MarkovPasswords.",
 epilog=f"""Sample runs:
-{__file__} train -i untrained.mdl -d dataset.dat -s "\\t" -o trained.mdl
+{__file__} train untrained.mdl -d dataset.dat -s "\\t" -o trained.mdl
     Import untrained.mdl, train it with dataset.dat which has tab delimited data, output resulting model to trained.mdl\n
 
-{__file__} generate -i trained.mdl -n 500 -w output.txt
+{__file__} generate trained.mdl -n 500 -w output.txt
     Import trained.mdl, and generate 500 lines to output.txt
 
-{__file__} combine -i untrained.mdl -d dataset.dat -s "\\t" -n 500 -w output.txt
+{__file__} combine untrained.mdl -d dataset.dat -s "\\t" -n 500 -w output.txt
     Train and immediately generate 500 lines to output.txt. Do not export trained model.
 
-{__file__} combine -i untrained.mdl -d dataset.dat -s "\\t" -n 500 -w output.txt -o trained.mdl
+{__file__} combine untrained.mdl -d dataset.dat -s "\\t" -n 500 -w output.txt -o trained.mdl
     Train and immediately generate 500 lines to output.txt. Export trained model.
 """, formatter_class=argparse.RawTextHelpFormatter)
 parser.add_argument("mode",                             help="Operation mode, supported modes: \"generate\", \"train\" and \"combine\".")
@@ -29,12 +29,13 @@
 parser.add_argument("-n", "--count",                    help="Number of lines to generate. Ignored in training mode.")
 parser.add_argument("-t", "--threads",default=10,       help="Number of lines to generate. Ignored in training mode.")
 parser.add_argument("-v", "--verbosity",action="count", help="Output verbosity.")
+parser.add_argument("-b", "--bulk",action="store_true", help="Bulk generate or bulk train every corpus/model in the folder.")
 args = parser.parse_args() 
 
 
 
 
-def cli_init():
+def cli_init(input_model):
     logging.VERBOSITY = 0
     if args.verbosity:
         logging.VERBOSITY = args.verbosity
@@ -46,72 +47,103 @@ def cli_init():
 
     logging.pprint("Importing model file.", 1)
 
-    if(not os.path.isfile(args.input)):
-        logging.pprint(f"Model file at {args.input} not found. Check the file path, or working directory")
+    if(not os.path.isfile(input_model)):
+        logging.pprint(f"Model file at {input_model} not found. Check the file path, or working directory")
         exit(1)
 
-    model.Import(args.input)
+    model.Import(input_model)
     logging.pprint("Model imported successfully.", 2)
     return model
 
-def cli_train(model, output_forced=False):
-    if not (args.dataset and args.seperator and (args.output or not output_forced)):
+def cli_train(model, dataset, seperator, output, output_forced=False, bulk=False):
+    if not (dataset and seperator and (output or not output_forced)):
         logging.pprint(f"Training mode requires -d/--dataset{', -o/--output' if output_forced else''} and -s/--seperator parameters. Exiting.")
         exit(2)
 
-    if(not os.path.isfile(args.dataset)):
-        logging.pprint(f"{args.dataset} doesn't exists. Check the file path, or working directory")
+    if(not bulk and not os.path.isfile(dataset)):
+        logging.pprint(f"{dataset} doesn't exists. Check the file path, or working directory")
         exit(3)
 
-    if(args.output and os.path.isfile(args.output)):
-        logging.pprint(f"{args.output} exists and will be overwritten.",1 )
+    if(output and os.path.isfile(output)):
+        logging.pprint(f"{output} exists and will be overwritten.",1 )
 
-    if(args.seperator == '\\t'):
+    if(seperator == '\\t'):
         logging.pprint("Escaping seperator.", 3)
-        args.seperator = '\t'
+        seperator = '\t'
 
-    if(len(args.seperator)!=1):
-        logging.pprint(f'Delimiter must be a single character, and "{args.seperator}" is not accepted.')
+    if(len(seperator)!=1):
+        logging.pprint(f'Delimiter must be a single character, and "{seperator}" is not accepted.')
         exit(4)
 
     logging.pprint(f'Starting training.', 3)
-    model.Train(args.dataset,args.seperator, int(args.threads))
+    model.Train(dataset,seperator, int(args.threads))
     logging.pprint(f'Training completed.', 2)
 
-    if(args.output):
-        logging.pprint(f'Exporting model.', 2)
-        model.Export(args.output)
+    if(output):
+        logging.pprint(f'Exporting model to {output}', 2)
+        model.Export(output)
     else:
         logging.pprint(f'Model will not be exported.', 1)
 
-def cli_generate(model):
-    if not (args.wordlist and args.count):
+def cli_generate(model, wordlist, bulk=False):
+    if not (wordlist or args.count):
         logging.pprint("Generation mode requires -w/--wordlist and -n/--count parameters. Exiting.")
         exit(2)
 
-    if(os.path.isfile(args.wordlist)):
-        logging.pprint(f"{args.wordlist} exists and will be overwritten.", 1)
+    if(bulk and os.path.isfile(wordlist)):
+        logging.pprint(f"{wordlist} exists and will be overwritten.", 1)
+    model.Generate(int(args.count), wordlist, int(args.min), int(args.max), int(args.threads))
+
+
+if(args.bulk):
+    logging.pprint(f"Bulk mode operation chosen.", 4)
+
+    if (args.mode.lower() == "train"):
+        if (os.path.isdir(args.output) and not os.path.isfile(args.output)) and (os.path.isdir(args.dataset) and not os.path.isfile(args.dataset)):
+            corpus_list = os.listdir(args.dataset)
+            for corpus in corpus_list:
+                model = cli_init(args.input)
+                logging.pprint(f"Training {args.input} with {corpus}", 2)
+                output_file_name = corpus
+                model_extension = ""
+                if "." in args.input:
+                    model_extension = args.input.split(".")[-1]
+                cli_train(model, f"{args.dataset}/{corpus}", args.seperator, f"{args.output}/{corpus}.{model_extension}", output_forced=True, bulk=True)
+        else:
+            logging.pprint("In bulk training, output and dataset should be a directory.")
+            exit(1)
+
+    elif (args.mode.lower() == "generate"):
+        if (os.path.isdir(args.wordlist) and not os.path.isfile(args.wordlist)) and (os.path.isdir(args.input) and not os.path.isfile(args.input)):
+            model_list = os.listdir(args.input)
+            print(model_list)
+            for input in model_list:
+                logging.pprint(f"Generating from {args.input}/{input} to {args.wordlist}/{input}.txt", 2)
+
+                model = cli_init(f"{args.input}/{input}")
+                model_base = input
+                if "." in args.input:
+                    model_base = input.split(".")[1]
+                cli_generate(model, f"{args.wordlist}/{model_base}.txt", bulk=True)
+        else:
+            logging.pprint("In bulk generation, input and wordlist should be directory.")
 
-    model.Generate(int(args.count), args.wordlist, int(args.min), int(args.max), int(args.threads))
-
-
-
-
-model = cli_init()
-if (args.mode.lower() == "generate"):
-    cli_generate(model)
+else:
+    model = cli_init(args.input)
+    if (args.mode.lower() == "generate"):
+        cli_generate(model, args.wordlist)
 
 
-elif (args.mode.lower() == "train"):
-    cli_train(model, output_forced=True)
+    elif (args.mode.lower() == "train"):
+        cli_train(model, args.dataset, args.seperator, args.output, output_forced=True)
 
 
-elif(args.mode.lower() == "combine"):
-    cli_train(model)
-    cli_generate(model)
+    elif(args.mode.lower() == "combine"):
+        cli_train(model, args.dataset, args.seperator, args.output)
+        cli_generate(model, args.wordlist)
 
 
-else:
-    logging.pprint("Invalid mode arguement given.")
-    logging.pprint("Accepted modes: 'Generate', 'Train', 'Combine'")
-    exit(5)
+    else:
+        logging.pprint("Invalid mode arguement given.")
+        logging.pprint("Accepted modes: 'Generate', 'Train', 'Combine'")
+        exit(5)
diff --git a/MarkovModel/src/edge.h b/MarkovModel/src/edge.h
@@ -29,12 +29,12 @@ namespace Markov {
 		* Adds the offset parameter to the edge EdgeWeight.
 		* @param offset - NodeValue to be added to the EdgeWeight
 		*/
-		void AdjustEdge(uint64_t offset);
+		void AdjustEdge(long int offset);
 
 		/** @brief Traverse this edge to RightNode.
 		* @return Right node. If this is a terminator node, return NULL
 		*/
-		Node<NodeStorageType>* TraverseNode();
+		inline Node<NodeStorageType>* TraverseNode();
 
 		/** @brief Set LeftNode of this edge.
 		* @param node - Node to be linked with.
@@ -48,7 +48,7 @@ namespace Markov {
 		/** @brief return edge's EdgeWeight.
 		* @return edge's EdgeWeight.
 		*/
-		uint64_t EdgeWeight();
+		inline uint64_t EdgeWeight();
 
 		/** @brief return edge's LeftNode
 		* @return edge's LeftNode.
@@ -58,12 +58,12 @@ namespace Markov {
 		/** @brief return edge's RightNode
 		* @return edge's RightNode.
 		*/
-		Node<NodeStorageType>* RightNode();
+		inline Node<NodeStorageType>* RightNode();
 
 	private:
 		Node<NodeStorageType>* _left; /** @brief source node*/
 		Node<NodeStorageType>* _right;/** @brief target node*/
-		int _weight;    /** @brief Edge EdgeWeight*/
+		long int _weight;    /** @brief Edge EdgeWeight*/
 	};
 
 
@@ -85,13 +85,13 @@ Markov::Edge<NodeStorageType>::Edge(Markov::Node<NodeStorageType>* _left, Markov
 }
 //to AdjustEdge the edges by the edge with its offset
 template <typename NodeStorageType>
-void Markov::Edge<NodeStorageType>::AdjustEdge(uint64_t offset) {
+void Markov::Edge<NodeStorageType>::AdjustEdge(long int offset) {
 	this->_weight += offset;
 	this->LeftNode()->UpdateTotalVerticeWeight(offset);
 }
 //to TraverseNode the node
 template <typename NodeStorageType>
-Markov::Node<NodeStorageType>* Markov::Edge<NodeStorageType>::TraverseNode() {
+inline Markov::Node<NodeStorageType>* Markov::Edge<NodeStorageType>::TraverseNode() {
 	if (this->RightNode()->NodeValue() == 0xff) //terminator node
 		return NULL;
 	return _right;
@@ -108,7 +108,7 @@ void Markov::Edge<NodeStorageType>::SetRightEdge(Markov::Node<NodeStorageType>*
 }
 //to get the EdgeWeight of the node
 template <typename NodeStorageType>
-uint64_t Markov::Edge<NodeStorageType>::EdgeWeight() {
+inline uint64_t Markov::Edge<NodeStorageType>::EdgeWeight() {
 	return this->_weight;
 }
 //to get the LeftNode of the node
@@ -118,7 +118,7 @@ Markov::Node<NodeStorageType>* Markov::Edge<NodeStorageType>::LeftNode() {
 }
 //to get the RightNode of the node
 template <typename NodeStorageType>
-Markov::Node<NodeStorageType>* Markov::Edge<NodeStorageType>::RightNode() {
+inline Markov::Node<NodeStorageType>* Markov::Edge<NodeStorageType>::RightNode() {
 	return this->_right;
 }
 

diff --git a/MarkovModel/src/model.h b/MarkovModel/src/model.h
@@ -9,6 +9,7 @@
 #include <fstream>
 #include <assert.h>
 #include <string>
+#include <algorithm>
 #include "node.h"
 #include "edge.h"
 
@@ -38,7 +39,7 @@ namespace Markov {
 		* Start from the starter node, invoke RandomNext on current node until terminator node is reached.
 		* @return Null terminated string that was generated.
 		*/
-		NodeStorageType* RandomWalk(int minSetting, int maxSetting);
+		NodeStorageType* RandomWalk(Markov::Random::RandomEngine* randomEngine, int minSetting, int maxSetting, NodeStorageType* buffer);
 
 		/** @brief Adjust the model with a single string. 
 		* Start from the starter node, and for each character, AdjustEdge the edge EdgeWeight from current node to the next, until NULL character is reached.
@@ -113,15 +114,15 @@ bool Markov::Model<NodeStorageType>::Import(std::ifstream* f) {
 
 	char src;
 	char target;
-	int oc;
+	long int oc;
 
 	while (std::getline(*f, cell)) {
 		//std::cout << "cell: " << cell << std::endl;
 		src = cell[0];
 		target = cell[cell.length() - 1];
-		oc = std::atoi(cell.substr(2, cell.length() - 2).c_str());
-
-
+		char* j;
+		oc = std::strtol(cell.substr(2, cell.length() - 2).c_str(),&j,10);
+		//std::cout << oc << "\n";
 		Markov::Node<NodeStorageType>* srcN;
 		Markov::Node<NodeStorageType>* targetN;
 		Markov::Edge<NodeStorageType>* e;
@@ -151,6 +152,15 @@ bool Markov::Model<NodeStorageType>::Import(std::ifstream* f) {
 
 	}
 
+	for (std::pair<unsigned char, Markov::Node<NodeStorageType>*> const& x : this->nodes) {
+		//std::cout << "Total edges in EdgesV: " << x.second->edgesV.size() << "\n"; 
+		std::sort (x.second->edgesV.begin(), x.second->edgesV.end(), [](Edge<NodeStorageType> *lhs, Edge<NodeStorageType> *rhs)->bool{
+			return lhs->EdgeWeight() > rhs->EdgeWeight();
+		});
+		//for(int i=0;i<x.second->edgesV.size();i++)
+		//	std::cout << x.second->edgesV[i]->EdgeWeight() << ", ";
+		//std::cout << "\n";
+	}
 	//std::cout << "Total number of nodes: " << this->nodes.size() << std::endl;
 	//std::cout << "Total number of edges: " << this->edges.size() << std::endl;
 
@@ -185,43 +195,33 @@ bool Markov::Model<NodeStorageType>::Export(const char* filename) {
 }
 
 template <typename NodeStorageType>
-NodeStorageType* Markov::Model<NodeStorageType>::RandomWalk(int minSetting, int maxSetting) {
+NodeStorageType* Markov::Model<NodeStorageType>::RandomWalk(Markov::Random::RandomEngine* randomEngine, int minSetting, int maxSetting, NodeStorageType* buffer) {
 	Markov::Node<NodeStorageType>* n = this->starterNode;
 	int len = 0;
-	NodeStorageType* ret = new NodeStorageType[64];
 	Markov::Node<NodeStorageType>* temp_node;
-	while (n != NULL) {
-		//n = n->RandomNext();
-		temp_node = n->RandomNext();
-		//dirty cutoff, needs better solution
-		if (len == 60)
-			break;
-		if (len > maxSetting) {
-			//std::cout<<"MAX ->"<< "node*: " << temp_node << ", len: " << len << "\n";
+	while (true) {
+		temp_node = n->RandomNext(randomEngine);
+		if (len >= maxSetting) {
 			break;
 		}
-
-		if ((temp_node == NULL) && (len < minSetting)) {
-			//std::cout << "node*: " << temp_node << ", len: " << len << "\n";
+		else if ((temp_node == NULL) && (len < minSetting)) {
 			continue;
 		}
 
-		if (temp_node == NULL)
+		else if (temp_node == NULL){
 			break;
+		}
+
 		n = temp_node;
 
-		//std::cout << n->NodeValue();
-		ret[len++] = n->NodeValue();
-
-		//maximum character length exceeded and stack will overflow.
-		//assert(len<32 && "return buffer overflowing, this will segfault if not aborted.");
+		buffer[len++] = n->NodeValue();
 	}
 
 	//null terminate the string
-	ret[len] = 0x00;
+	buffer[len] = 0x00;
 
 	//do something with the generated string
-	return ret; //for now
+	return buffer; //for now
 }
 
 template <typename NodeStorageType>