diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/run_all_tests.py b/tests/run_all_tests.py
new file mode 100644
index 0000000..05ec8eb
--- /dev/null
+++ b/tests/run_all_tests.py
@@ -0,0 +1,22 @@
+import unittest
+import sys
+import os
+
+def run_all_tests():
+    """Discover and run all tests in the tests directory"""
+    # Get the directory containing this file
+    test_dir = os.path.dirname(os.path.abspath(__file__))
+
+    # Discover and run tests
+    loader = unittest.TestLoader()
+    suite = loader.discover(test_dir, pattern='test_*.py')
+
+    # Run tests with verbosity
+    runner = unittest.TextTestRunner(verbosity=2)
+    result = runner.run(suite)
+
+    # Return 0 if all tests passed, 1 if any failed
+    return 0 if result.wasSuccessful() else 1
+
+if __name__ == '__main__':
+    sys.exit(run_all_tests())
\ No newline at end of file
diff --git a/tests/test_categories.csv b/tests/test_data/test_categories.csv
similarity index 100%
rename from tests/test_categories.csv
rename to tests/test_data/test_categories.csv
diff --git a/tests/test_messages.csv b/tests/test_data/test_messages.csv
similarity index 100%
rename from tests/test_messages.csv
rename to tests/test_data/test_messages.csv
diff --git a/tests/test_data_processing.py b/tests/test_data_processing.py
deleted file mode 100644
index b0a0958..0000000
--- a/tests/test_data_processing.py
+++ /dev/null
@@ -1,211 +0,0 @@
-import sys
-import unittest
-import pandas as pd
-import os
-from sqlalchemy import create_engine
-from data import load_data
-from test_train_classifier import chicken_duties
-from data.process_data import load_data
-from data.process_data import save_data
-
-
-class TestProcessData(unittest.TestCase):
-
-    def setUp(self):
-        """Set up sample data for testing."""
-        # Sample messages and categories data
-        self.messages_data = {
-            'id': [1, 2, 3],
-            'message': ['Hello', 'Help', 'Goodbye'],
-            'original': ['Hola', 'Ayuda', 'Adiós'],
-            'genre': ['social', 'news', 'direct']
-        }
-        self.categories_data = {
-            'id': [1, 2, 3],
-            'categories': ['related-1;request-0;offer-0',
-                           'related-1;request-1;offer-0',
-                           'related-0;request-0;offer-1']
-        }
-
-        # Create DataFrames from sample data
-        self.messages_df = pd.DataFrame(self.messages_data)
-        self.categories_df = pd.DataFrame(self.categories_data)
-
-    def test_load_data(self):
-        """Test loading and merging data."""
-        # Create temporary CSV files for messages and categories
-        self.messages_df.to_csv('test_messages.csv', index=False)
-        self.categories_df.to_csv('test_categories.csv', index=False)
-
-        # Load and merge data
-        df = load_data('test_messages.csv', 'test_categories.csv')
-
-        # Test the shape and contents of the combined dataframe
-        self.assertEqual(df.shape[0], 3) # Check the number of rows
-        self.assertIn('message', df.columns) # Check for specific column
-        self.assertIn('categories', df.columns) # Check for specific column
-
-        # Clean up temporary files
-        os.remove('test_messages.csv')
-        os.remove('test_categories.csv')
-
-    def test_clean_data(self):
-        """Test cleaning of data."""
-        # Merge the messages and categories first
-        df = pd.merge(self.messages_df, self.categories_df, on="id")
-
-        # Clean the merged dataframe
-        cleaned_df = clean_data(df)
-
-        # Test the new columns are binary and correctly converted
-        self.assertEqual(cleaned_df.shape[0], 3) # Check the number of rows
-        self.assertIn('related', cleaned_df.columns) # Check for expanded categories
-        self.assertTrue(all(cleaned_df['related'].isin([0, 1]))) # Ensure binary conversion
-
-        # Test that there are no duplicates
-        self.assertFalse(cleaned_df.duplicated().any())
-
-    def test_save_data(self):
-        """Test saving data to a SQLite database."""
-        # Sample cleaned dataframe
-        cleaned_df = clean_data(pd.merge(self.messages_df, self.categories_df, on="id"))
-
-        # Save to an SQLite database
-        save_data(cleaned_df, 'test_database.db')
-
-        # Test that the database and table exist
-        engine = create_engine("sqlite:///test_database.db")
-        table_names = engine.table_names()
-        self.assertIn('cleandata', table_names) # Check that table was created
-
-        # Clean up the test database file
-        os.remove('test_database.db')
-
-if __name__ == '__main__':
-    unittest.main()
-
-
-class TestDataProcessing(unittest.TestCase):
-
-    def setUp(self):
-        # Sample CSV data for testing
-        self.messages_data = """id,message
-        1,Hello World
-        2,Machine Learning is fun
-        3,Unit testing is important
-        """
-        self.categories_data = """id,categories
-        1,related;request;1;0;0
-        2,related;offer;0;1;0
-        3,request;related;0;0;1
-        """
-        # Create DataFrames
-        self.messages_filepath = 'messages_test.csv'
-        self.categories_filepath = 'categories_test.csv'
-        self.df_messages = pd.read_csv(StringIO(self.messages_data))
-        self.df_categories = pd.read_csv(StringIO(self.categories_data))
-
-        # Save to CSV for testing
-        self.df_messages.to_csv(self.messages_filepath, index=False)
-        self.df_categories.to_csv(self.categories_filepath, index=False)
-
-    def test_load_data(self):
-        """Test loading data from CSV files."""
-        df = load_data(self.messages_filepath, self.categories_filepath)
-        self.assertEqual(df.shape[0], 3) # Check if 3 rows are loaded
-        self.assertIn('message', df.columns) # Check if 'message' column exists
-        self.assertIn('categories', df.columns) # Check if 'categories' column exists
-
-    def test_clean_data(self):
-        """Test cleaning of data."""
-        df = load_data(self.messages_filepath, self.categories_filepath)
-        cleaned_df = clean_data(df)
-        self.assertIn('related', cleaned_df.columns) # Check if the category 'related' is present
-        self.assertIn('request', cleaned_df.columns) # Check if the category 'request' is present
-        self.assertTrue((cleaned_df['related'].isin([0, 1])).all()) # Check binary values in 'related' column
-
-    def test_save_data(self):
-        """Test saving data to SQLite database."""
-        df = load_data(self.messages_filepath, self.categories_filepath)
-        cleaned_df = clean_data(df)
-        database_path = 'test_database.db'
-
-        # Use a context manager to avoid leaving the database open
-        with self.assertRaises(Exception):
-            save_data(cleaned_df, database_path)
-
-    def tearDown(self):
-        import os
-        os.remove(self.messages_filepath) # Remove test CSV files
-        os.remove(self.categories_filepath)
-
-if __name__ == '__main__':
-    unittest.main()
-
-
-print(tokenize('There.'))
-print(WordNetLemmatizer().lemmatize('there'))
-'there' in set(stopwords.words("english"))
-
-
-def display_dataset(X_train, y_train, X_test=None, y_test=None):
-    """
-    """
-    print("unique Y values: ", np.unique(Y))
-    print("training set, X: ", X_train.shape)
-    if X_test is not None:
-        print("test set, X: ",X_test.shape)
-    print("training set, Y: ",y_train.shape)
-    if y_test is not None:
-        print("test set, Y: ",y_test.shape)
-
-def data_type_check(X1, X2):
-    """
-    """
-    # check data types of
-    print("X1 shape: ", X1.shape)
-    print("X2 shape: ", X2.shape)
-    print("X1 Type: ", type(X1))
-    print("X2 Type: ",type(X2))
-
-
-text = 'What can I do?'
-tokens = tokenize(text)
-print(tokens)
-for token in word_tokenize(text.lower()):
-    print(WordNetLemmatizer().lemmatize(token))
-    print(f'{token}, {token in set(stopwords.words("english"))}')
-
-
-
-# print(accuracy(y_test, y_pred))
-
-# # Now you can generate the classification report
-# for col_index in range(0,y_test.shape[1]):
-#     report = classification_report(y_test[:,col_index], y_pred[:, col_index], zero_division=0)
-#     print(classes[col_index])
-#     print(report)
-
-
-
-
-
-
-
-#Testing Tokenize Function - Use this to test the output of the tokenize function.
-
-text1 = "Barclaysjbki CEO stresses the importance of regulatory and cultural reform in financial services at Brussels conference https://www.google.com"
-print(f'input text: "{text1}"\n')
-print(f"text tokens: {tokenize(text1)} \n")
-text2 = "The No. 8 Northeast Gale or storm signal was issued at 5.55pm yesterday (September 14) and was replaced by Southeast gale and storm signal at 12.35am today (September 15)."
-print(f'input text: "{text2}" \n')
-print(f"text tokens: {tokenize(text2)} \n")
-sentence_list = sent_tokenize(text2)
-print(f"sentences: {sentence_list} \n")
-print("testing sentence tokenization...")
-for text in sentence_list:
-    print(f'\ntext: "{text}"')
-    print(f"\ntext tokens: {tokenize(text)}")
-
-
- 
\ No newline at end of file
diff --git a/tests/test_process_data.py b/tests/test_process_data.py
new file mode 100644
index 0000000..f4a7dda
--- /dev/null
+++ b/tests/test_process_data.py
@@ -0,0 +1,159 @@
+# tests/test_process_data.py
+import unittest
+import sys
+import os
+import pandas as pd
+from sqlalchemy import create_engine
+
+# Add the parent directory to the Python path
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from data.process_data import load_data, clean_data, save_data
+
+class TestProcessData(unittest.TestCase):
+    """Test cases for disaster response data processing functions"""
+
+    @classmethod
+    def setUpClass(cls):
+        """Set up test fixtures including sample CSV files"""
+        # Define paths for test files
+        cls.test_data_dir = os.path.join(os.path.dirname(__file__), 'test_data')
+        cls.messages_filepath = os.path.join(cls.test_data_dir, 'test_messages.csv')
+        cls.categories_filepath = os.path.join(cls.test_data_dir, 'test_categories.csv')
+        cls.database_filepath = os.path.join(cls.test_data_dir, 'test_db.db')
+
+        # Create test_data directory if it doesn't exist
+        os.makedirs(cls.test_data_dir, exist_ok=True)
+
+    def setUp(self):
+        """Create sample CSV files before each test"""
+        # Create a small sample messages.csv file
+        messages_data = """id,message,original,genre
+1,"Help! We need water","Help! We need water",direct
+2,"Need food and shelter","Need food and shelter",direct
+3,"No electricity","No electricity",news
+4,"Medical assistance needed","Medical assistance needed",news"""
+
+        # Create a small sample categories.csv file; every row carries the same
+        # full category set so clean_data can split and expand it consistently
+        categories_data = """id,categories
+1,"related-1;request-1;aid_related-1;medical_help-0;medical_assistance-0;water-1;food-0;shelter-0;infrastructure_related-0;electricity-0"
+2,"related-1;request-1;aid_related-1;medical_help-0;medical_assistance-0;water-0;food-1;shelter-1;infrastructure_related-0;electricity-0"
+3,"related-1;request-0;aid_related-1;medical_help-0;medical_assistance-0;water-0;food-0;shelter-0;infrastructure_related-1;electricity-1"
+4,"related-1;request-1;aid_related-1;medical_help-1;medical_assistance-1;water-0;food-0;shelter-0;infrastructure_related-0;electricity-0" """
+
+        # Write test CSV files
+        with open(self.messages_filepath, 'w') as f:
+            f.write(messages_data)
+
+        with open(self.categories_filepath, 'w') as f:
+            f.write(categories_data)
+
+    def test_load_data(self):
+        """Test if load_data correctly loads and merges the datasets"""
+        df = load_data(self.messages_filepath, self.categories_filepath)
+
+        # Test the loaded data structure
+        self.assertEqual(len(df), 4) # Should have 4 rows
+        self.assertTrue(all(col in df.columns
+                            for col in ['id', 'message', 'original', 'genre', 'categories']))
+
+        # Test data content
+        self.assertEqual(df.iloc[0]['message'], 'Help! We need water')
+        self.assertEqual(df.iloc[0]['genre'], 'direct')
+
+    def test_clean_data(self):
+        """Test if clean_data correctly processes the DataFrame"""
+        # First load the data
+        df = load_data(self.messages_filepath, self.categories_filepath)
+        cleaned_df = clean_data(df)
+
+        # Test binary values in category columns
+        category_columns = [col for col in cleaned_df.columns
+                            if col not in ['id', 'message', 'original', 'genre']]
+
+        for col in category_columns:
+            unique_vals = cleaned_df[col].unique()
+            self.assertTrue(all(val in [0.0, 1.0] for val in unique_vals),
+                            f"Column {col} contains non-binary values: {unique_vals}")
+
+        # Test no duplicates
+        self.assertEqual(len(cleaned_df), len(cleaned_df.drop_duplicates()))
+
+        # Test expected transformations
+        self.assertTrue('water' in cleaned_df.columns)
+        self.assertEqual(cleaned_df.iloc[0]['water'], 1.0)
+
+    def test_save_data(self):
+        """Test if save_data correctly saves the DataFrame to SQLite database"""
+        # Load and clean the data
+        df = load_data(self.messages_filepath, self.categories_filepath)
+        cleaned_df = clean_data(df)
+
+        # Save to database
+        save_data(cleaned_df, self.database_filepath)
+
+        # Verify data was saved correctly
+        engine = create_engine(f'sqlite:///{self.database_filepath}')
+        saved_df = pd.read_sql_table('cleandata', engine)
+
+        self.assertEqual(len(saved_df), len(cleaned_df))
+        self.assertTrue(all(col in saved_df.columns for col in cleaned_df.columns))
+
+    def tearDown(self):
+        """Clean up test files after each test"""
+        # Remove test files
+        for filepath in [self.messages_filepath, self.categories_filepath, self.database_filepath]:
+            try:
+                if os.path.exists(filepath):
+                    os.remove(filepath)
+            except PermissionError:
+                pass # Handle Windows file lock issues
+
+    @classmethod
+    def tearDownClass(cls):
+        """Clean up test directory after all tests"""
+        try:
+            os.rmdir(cls.test_data_dir)
+        except (OSError, PermissionError):
+            pass # Directory might not be empty or might be locked
+
+class TestTextProcessing(unittest.TestCase):
+    """Test cases for text processing functions"""
+
+    def setUp(self):
+        """Load sample messages from test CSV"""
+        messages_filepath = os.path.join(
+            os.path.dirname(__file__),
+            'test_data/test_messages.csv'
+        )
+        if os.path.exists(messages_filepath):
+            self.test_df = pd.read_csv(messages_filepath)
+        else:
+            self.test_df = pd.DataFrame({
+                'message': [
+                    'Help! Need water.',
+                    'We need medical supplies and food immediately!!',
+                    'Need 100 blankets at shelter 5'
+                ]
+            })
+
+    def test_tokenize(self):
+        """Test if tokenize correctly processes text"""
+        from data.process_data import tokenize
+
+        # Test first message
+        tokens = tokenize(self.test_df['message'].iloc[0])
+        self.assertTrue(all(isinstance(token, str) for token in tokens))
+        self.assertTrue(len(tokens) > 0)
+
+    def test_tokenize_empty(self):
+        """Test tokenize with empty input"""
+        from data.process_data import tokenize
+        self.assertEqual(tokenize(''), [])
+
+    def tearDown(self):
+        """Clean up test resources"""
+        # Clear the test DataFrame
+        self.test_df = None
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
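
Note on running the suite: since this change adds tests/__init__.py, the tests directory becomes a package, so the tests can be invoked either through the new helper script or with plain unittest discovery. A minimal usage sketch (assuming the commands are run from the repository root, so that the data package is importable):

    python tests/run_all_tests.py
    python -m unittest discover tests -p "test_*.py" -v

Both forms use the same test_*.py pattern that run_all_tests() passes to TestLoader.discover.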