diff --git a/README.md b/README.md new file mode 100644 index 0000000..94251ad --- /dev/null +++ b/README.md @@ -0,0 +1,24 @@ +## WordList Cleaner v.0.2.3 + +Removes words with non-printable characters, filters words by length, removes duplicates, sorts, and counts lines. + +![Alt text](/screenshot.jpg?raw=true "Usage") + +The order of the option keys does not matter. + +05.09.2015 - First commit. + +06.09.2015 - Added automatic processing of all files in a directory by extension. + +07.09.2015 - Added a line calculator. Fixed some errors. + +12.09.2015 - All algorithms have been rewritten to consume less memory. The usage menu and option keys have been changed too. + +Examples: +``` +wordlistcleaner.exe -min 8 -max 10 -src Source.dic -new New.dic remove trim +wordlistcleaner.exe -src Source.dic -new New.dic trim +wordlistcleaner.exe -a -ext txt duplicate +wordlistcleaner.exe -a sort +wordlistcleaner.exe -a calculate +``` \ No newline at end of file diff --git a/main.go b/main.go new file mode 100644 index 0000000..d215ed8 --- /dev/null +++ b/main.go @@ -0,0 +1,153 @@ +package main + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "strconv" + "time" + + o "./operations" + s "./service" +) + +// Do job +func DoJob(remove, trim, duplicate, sorting, calculate bool, min, max int, src_file, new_file string) error { + + // Check operations + if !remove && !trim && !duplicate && !sorting && !calculate { + return errors.New("Not specified operations.") + } + + // Cleaning + if (remove || trim) && (!duplicate && !sorting && !calculate) { + if err := o.DoClean(remove, trim, min, max, src_file, new_file); err != nil { + return err + } + return nil + } + + //Duplicate search + if duplicate && (!remove && !trim && !sorting && !calculate) { + if err := o.DoDuplicate(src_file, new_file); err != nil { + return err + } + return nil + } + + // Sorting + if sorting && (!remove && !trim && !duplicate && !calculate) { + if err := o.DoSorting(src_file, new_file); err != nil { + return err + } + return nil + } 
// SplitFileName splits file into the part before its extension and the
// extension itself, as determined by filepath.Ext (the extension includes the
// leading dot). When file has no extension, the second result is empty and
// the first is file unchanged.
func SplitFileName(file string) (string, string) {
	// filepath.Ext always returns a suffix of its argument, so one cut
	// index yields both halves.
	cut := len(file) - len(filepath.Ext(file))
	return file[:cut], file[cut:]
}
+ os.Args[k+1] + } + } + + // start time + start := time.Now() + + if auto { + files_list, err := s.SearchFilesInDir(file_ext, "./") + s.CheckError(err) + fmt.Println() + fmt.Println(len(files_list), "files found.") + fmt.Println() + for _, src_file := range files_list { + name, ext := SplitFileName(src_file) + new_file := name + "_cleaned" + ext + err = DoJob(remove, trim, duplicate, sorting, calculate, min, max, src_file, new_file) + s.CheckError(err) + } + } else { + err := DoJob(remove, trim, duplicate, sorting, calculate, min, max, src_file, new_file) + s.CheckError(err) + } + + // elapsed time + elapsed := time.Since(start) + fmt.Println("\nElapsed time: ", elapsed) +} diff --git a/operations/calculate.go b/operations/calculate.go new file mode 100644 index 0000000..d76aefb --- /dev/null +++ b/operations/calculate.go @@ -0,0 +1,20 @@ +package operations + +import ( + "fmt" + + s "../service" +) + +// Calculate lines in source file +func DoCalculate(src_file string) error { + + total, err := s.CalculateLines(src_file) + if err != nil { + return err + } + + fmt.Printf("|%-40s|%20d|\n", src_file, total) + + return nil +} diff --git a/operations/duplicate_search.go b/operations/duplicate_search.go new file mode 100644 index 0000000..c9d9a48 --- /dev/null +++ b/operations/duplicate_search.go @@ -0,0 +1,68 @@ +package operations + +import ( + "bufio" + "fmt" + "os" + + s "../service" +) + +// Search duplicates in source file and write uniq to new file +func DoDuplicate(src_file, new_file string) error { + + m := map[string]bool{} + counter := 0 + percent := 0 + added := 0 + total, err := s.CalculateLines(src_file) + if err != nil { + return err + } + + in, err := os.Open(src_file) + if err != nil { + return err + } + defer in.Close() + + out, err := os.Create(new_file) + if err != nil { + return err + } + defer out.Close() + + scanner := bufio.NewScanner(in) + writer := bufio.NewWriter(out) + + fmt.Printf("\n%s processing: ", src_file) + + for scanner.Scan() { + 
// IsPrint reports whether every rune of text is a printable ASCII character,
// i.e. lies in the range 32 (space) through 126 ('~') inclusive.
// The empty string is considered printable.
func IsPrint(text string) bool {
	for _, r := range text {
		if r < 32 || r > 126 {
			return false
		}
	}
	return true
}

// IsSize reports whether the length of line, measured in runes (not bytes),
// lies within the inclusive range [min, max].
//
// Runes are counted by ranging over the string instead of converting it to a
// []rune, which avoids allocating a rune slice for every line checked. The
// count is the same as len([]rune(line)): each invalid UTF-8 byte counts as
// one rune in both cases.
func IsSize(min, max int, line string) bool {
	n := 0
	for range line {
		n++
		if n > max {
			// Already too long; no need to scan the rest of the line.
			return false
		}
	}
	return n >= min && n <= max
}
// CalculateLines returns the number of lines in the file at path.
//
// The file is read in fixed-size chunks, so memory use does not grow with the
// file size. Every '\n' terminates a line; in addition, a final line that is
// not terminated by '\n' is counted too, so the result matches the number of
// lines a bufio.Scanner yields for the same file. An empty file has zero
// lines.
//
// If the file cannot be opened or read, the error is returned with a count
// of 0.
func CalculateLines(path string) (int, error) {
	file, err := os.Open(path)
	if err != nil {
		return 0, err
	}
	defer file.Close()

	buf := make([]byte, 8192)
	lineSep := []byte{'\n'}
	count := 0
	// Last byte seen so far. Starting at '\n' means an empty file is not
	// mistaken for one unterminated line.
	last := byte('\n')

	for {
		n, err := file.Read(buf)
		if n > 0 {
			count += bytes.Count(buf[:n], lineSep)
			last = buf[n-1]
		}
		if err == io.EOF {
			break
		}
		if err != nil {
			return 0, err
		}
	}

	// The file ended without a newline: the trailing text is still a line.
	if last != '\n' {
		count++
	}
	return count, nil
}
// SearchFilesInDir returns the files located directly in the directory path
// whose extension, as reported by filepath.Ext (including the leading dot),
// equals file_ext. Subdirectories are not descended into.
//
// Directories are skipped even when their name carries the extension, since
// callers open every returned entry as a file. Each result is path joined
// with the entry name, so it can be opened regardless of the current working
// directory; for path "./" this yields the bare file name.
//
// An error is returned if the directory cannot be read or if no file matches.
func SearchFilesInDir(file_ext, path string) ([]string, error) {
	entries, err := ioutil.ReadDir(path)
	if err != nil {
		return nil, err
	}

	var files_list []string
	for _, entry := range entries {
		if entry.IsDir() {
			continue
		}
		if filepath.Ext(entry.Name()) == file_ext {
			files_list = append(files_list, filepath.Join(path, entry.Name()))
		}
	}

	if len(files_list) == 0 {
		return nil, errors.New("No files found.")
	}
	return files_list, nil
}
[Dict_cleaned.dic]") + fmt.Println() + fmt.Println("Mode:") + fmt.Println(" -a Automatic processing of all files in a directory. [false]") + fmt.Println(" -ext STR File extension. Only for automatic mode. [dic]") + fmt.Println() + fmt.Println(" -h This help.") + fmt.Println(" -v Print version.") +}