Skip to content

Commit

Permalink
Merge pull request #131 from dod-advana/feature/UOT-145475-finetune-w…
Browse files Browse the repository at this point in the history
…ith-new-data

Feature/uot 145475 finetune with new data
  • Loading branch information
rha930 committed Jun 23, 2022
2 parents 85701e5 + 76e63ff commit 7953db6
Show file tree
Hide file tree
Showing 14 changed files with 361 additions and 144 deletions.
1 change: 1 addition & 0 deletions dev.requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ matplotlib
seaborn
autopep8
black
debugpy
1 change: 1 addition & 0 deletions gamechangerml/api/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ services:
- capabilities: [gpu]
ports:
- "5000:5000"
- "5678:5678"
env_file:
- .env
volumes:
Expand Down
6 changes: 5 additions & 1 deletion gamechangerml/api/fastapi/mlapp.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
from fastapi import FastAPI
import faulthandler
from fastapi import FastAPI

from gamechangerml.api.fastapi.routers import startup, search, controls
from gamechangerml.debug.debug_connector import debug_if_flagged

# start debugger if flagged
debug_if_flagged()

# start API
app = FastAPI()
Expand Down
2 changes: 1 addition & 1 deletion gamechangerml/api/fastapi/routers/controls.py
Original file line number Diff line number Diff line change
Expand Up @@ -1030,7 +1030,7 @@ async def get_user_data(data_dict: dict, response: Response):
searchData = data_dict["params"]["searchData"]
df = pd.DataFrame(searchData)
GC_SEARCH_DATA = os.path.join(
DATA_PATH, "user_data", "search_history","SearchPdfMapping.csv"
DATA_PATH, "user_data", "search_history", "SearchPdfMapping.csv"
)
df.to_csv(GC_SEARCH_DATA)

Expand Down
26 changes: 18 additions & 8 deletions gamechangerml/api/utils/threaddriver.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,32 @@
import json
import sys
from gamechangerml.api.utils.logger import logger
from gamechangerml.api.utils import processmanager
from gamechangerml.debug.debug_connector import check_debug_flagged

# A class that takes in a function and a dictionary of arguments.
# The keys in args have to match the parameters in the function.


class MlThread(threading.Thread):
def __init__(self, function, args = {}):
def __init__(self, function, args={}):
super(MlThread, self).__init__()
self.function = function
self.args = args
self.killed = False

def run(self):
try:
sys.settrace(self.globaltrace)
if check_debug_flagged():
logger.info(
"Debugger from debugpy package is not compatible with sys.settrace, so globaltrace not activated for MlThread")
else:
sys.settrace(self.globaltrace)

self.function(**self.args)
except Exception as e:
logger.error(e)
logger.info("Thread errored out attempting " + self.function.__name__ + " with parameters: " + json.dumps(self.args))
logger.info("Thread errored out attempting " + self.function.__name__ +
" with parameters: " + json.dumps(self.args))

def globaltrace(self, frame, why, arg):
if why == 'call':
Expand All @@ -40,16 +48,18 @@ def kill(self):
# Pass in a function and args which is an array of dicts
# A way to load multiple jobs and run them on threads.
# join is set to false unless we need to collect the results immediately.


def run_threads(function_list, args_list=None, join=False):
    """Start one MlThread per function and optionally wait for completion.

    Args:
        function_list: Callables to run, one thread each.
        args_list: Optional list of kwargs dicts matched positionally to
            *function_list*; functions without a matching entry run with
            no arguments.
        join: When True, block until every started thread has finished.
    """
    # None-default instead of a mutable [] default, which would be shared
    # between calls to this function.
    if args_list is None:
        args_list = []
    threads = []
    for i, function in enumerate(function_list):
        args = args_list[i] if i < len(args_list) else {}
        thread = MlThread(function, args)
        threads.append(thread)
        thread.start()
    # If we join the threads the function will wait until they have all completed.
    if join:
        for thread in threads:
            thread.join()
32 changes: 32 additions & 0 deletions gamechangerml/debug/debug_connector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import os
from gamechangerml.api.fastapi.settings import logger

# Name of the environment variable that toggles the remote debugger.
env_flag = "ENABLE_DEBUGGER"


def check_debug_flagged():
    """Return True when the ENABLE_DEBUGGER environment variable is truthy.

    Accepts 'true' in any letter case and ignores surrounding whitespace,
    so values such as 'True' exported from shell configs also enable
    debugging. Absent or any other value means disabled.
    """
    flag_str = os.getenv(env_flag, "false")
    return flag_str.strip().lower() == 'true'


def debug_if_flagged():
    """Start a debugpy listener on port 5678 when the debug env flag is set.

    When the flag is absent or false this only logs a line and returns.
    Any failure while starting the debugger is logged (with short pauses so
    the messages are visible in scrolling container logs) but never raised,
    so the API can still boot without a debugger.
    """
    if not check_debug_flagged():
        logger.info("ENABLE_DEBUGGER not set, debugger not started")
        return

    try:
        import debugpy
        debugger_port = 5678
        debugpy.listen(('0.0.0.0', debugger_port))
        logger.info(f"\n Debugger listening on {debugger_port} 🥾🦟 \n")
        # debugpy.wait_for_client()
        # debugpy.breakpoint()
    except Exception as e:
        import time
        logger.warning("ERROR STARTING DEBUGGER CONNECTION")
        time.sleep(3)
        logger.warning(e)
        time.sleep(3)
        logger.info(
            f"Debugging can be turned off by removing env variable {env_flag}")
63 changes: 63 additions & 0 deletions gamechangerml/debug/vscode_debug.README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Debug setup for Visual Studio Code

If you already have a launch.json, merge the below config, otherwise:

go to `Run and Debug` tab on sidebar

click link text `create a launch.json file`

select: `Python`

select: `Remote Attach`

Enter the host name... : `localhost`

Enter the port number... : `5678`

Your `launch.json` should then look like the example below:
```
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: Remote Attach",
"type": "python",
"request": "attach",
"connect": {
"host": "localhost",
"port": 5678
},
"pathMappings": [
{
"localRoot": "${workspaceFolder}",
"remoteRoot": "."
}
],
"justMyCode": true
}
]
}
```

## How it's working

When you use

`docker-compose up` - `gamechanger-ml/gamechangerml/api/docker-compose.yml`

This exposes port `5678` for `gamechanger-ml-gpu`

<br>

In the entrypoint `gamechanger-ml/gamechangerml/api/fastapi/mlapp.py`

`debug_if_flagged()` is called immediately, which is from `gamechanger-ml/gamechangerml/debug/debug_connector.py`

This starts up `debugpy` to listen on `5678` if the ENV variable `ENABLE_DEBUGGER` in `setup_env.sh` is set to `true`

The vscode debugger will attach to it using `launch.json` config

Now you're ready to crush bugs 🥾🦟
91 changes: 48 additions & 43 deletions gamechangerml/scripts/update_eval_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from gamechangerml.src.model_testing.validation_data import IntelSearchData
from gamechangerml.configs.config import ValidationConfig
from gamechangerml.src.utilities.test_utils import (
make_timestamp_directory, check_directory, CustomJSONizer
)
make_timestamp_directory, check_directory, NumpyJSONEncoder
)
from gamechangerml import DATA_PATH
from gamechangerml.api.utils.pathselect import get_model_paths
import logging
Expand All @@ -17,27 +17,29 @@


def make_tiered_eval_data(index_path, testing_only):

if not index_path:
index_path = SENT_INDEX

if not os.path.exists(os.path.join(DATA_PATH, "validation", "domain", "sent_transformer")):
os.mkdir(os.path.join(DATA_PATH, "validation", "domain", "sent_transformer"))

sub_dir = os.path.join(DATA_PATH, "validation", "domain", "sent_transformer")

os.mkdir(os.path.join(DATA_PATH, "validation",
"domain", "sent_transformer"))

sub_dir = os.path.join(DATA_PATH, "validation",
"domain", "sent_transformer")

save_dir = make_timestamp_directory(sub_dir)

def save_data(
level: str,
min_correct_matches: int,
max_results: int,
start_date: str,
end_date: str,
exclude_searches: List[str],
filter_queries: bool,
testing_only: bool,
save_dir: Union[str,os.PathLike]=save_dir) -> Tuple[Dict[str,str], Dict[str,str], Dict[str,str]]:
level: str,
min_correct_matches: int,
max_results: int,
start_date: str,
end_date: str,
exclude_searches: List[str],
filter_queries: bool,
testing_only: bool,
save_dir: Union[str, os.PathLike] = save_dir) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str]]:
"""Makes eval data for each tier level using args from config.py and saves to save_dir
Args:
level [str]: tier level (['any', 'silver', 'gold'])
Expand All @@ -54,19 +56,19 @@ def save_data(
max_res = max_results[level]

intel = IntelSearchData(
start_date=start_date,
end_date=end_date,
exclude_searches=exclude_searches,
min_correct_matches=min_matches,
max_results=max_res,
filter_queries=filter_queries,
index_path=index_path,
testing_only=testing_only
)
start_date=start_date,
end_date=end_date,
exclude_searches=exclude_searches,
min_correct_matches=min_matches,
max_results=max_res,
filter_queries=filter_queries,
index_path=index_path,
testing_only=testing_only
)

save_intel = {
"queries": intel.queries,
"collection": intel.collection,
"queries": intel.queries,
"collection": intel.collection,
"meta_relations": intel.all_relations,
"correct": intel.correct,
"incorrect": intel.incorrect,
Expand All @@ -89,45 +91,48 @@ def save_data(
"filter_queries": str(filter_queries)
}

save_intel = json.dumps(save_intel, cls=CustomJSONizer)
save_intel = json.dumps(save_intel, cls=NumpyJSONEncoder)
intel_path = check_directory(os.path.join(save_dir, level))
intel_file = os.path.join(intel_path, 'intelligent_search_data.json')
metafile = os.path.join(intel_path, 'intelligent_search_metadata.json')
metafile = os.path.join(intel_path, 'intelligent_search_metadata.json')
with open(intel_file, "w") as outfile:
json.dump(save_intel, outfile)

with open(metafile, "w") as outfile:
json.dump(metadata, outfile)
logger.info(f"***Saved intelligent search validation data to: {intel_path}")
logger.info(
f"***Saved intelligent search validation data to: {intel_path}")

return metadata

all_data = save_data(
level='any',
filter_queries = False,
testing_only = testing_only,
filter_queries=False,
testing_only=testing_only,
**ValidationConfig.TRAINING_ARGS
)
)

silver_data = save_data(
level='silver',
filter_queries = False,
filter_queries=False,
testing_only=testing_only,
**ValidationConfig.TRAINING_ARGS
)
)

gold_data = save_data(
level='gold',
filter_queries = False, # should use same (updated) exclude list of queries as silver_data
# should use same (updated) exclude list of queries as silver_data
filter_queries=False,
testing_only=testing_only,
**ValidationConfig.TRAINING_ARGS
)
)

return all_data, silver_data, gold_data


if __name__ == '__main__':
    # Build the tiered eval data with defaults; log errors instead of crashing.
    try:
        make_tiered_eval_data(index_path=None, testing_only=False)
    except Exception as e:
        logger.warning(e, exc_info=True)
3 changes: 2 additions & 1 deletion gamechangerml/setup_env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ function setup_dev() {
export GC_ENABLE_SSL="${GC_ENABLE_SSL:-false}"

export ML_WEB_TOKEN="${ML_WEB_TOKEN:-eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJwZXJtcyI6WyJHYW1lY2hhbmdlciBBZG1pbiJdLCJjbiI6IjAwNyIsImNzcmYtdG9rZW4iOiI0ZWE1YzUwOGE2NTY2ZTc2MjQwNTQzZjhmZWIwNmZkNDU3Nzc3YmUzOTU0OWM0MDE2NDM2YWZkYTY1ZDIzMzBlIiwiaWF0IjoxNjQ0MzQyMjI0fQ.ezxT-36a25IMFJPea3re7sUwYZfm8ivZvDVAv_zti1W6DRkM2Hs7WwFuy-gR092m-p7Z7ohb7yM2AKVbJAn4CMtI3_1j0_VzzYb6yhQj7YIPg-Cax5Em9JCrIlCEx7r26o-zV1me0mIYMATJTMInKikeBvg2RJErvLwkgZNQVT8gQyR-JxM4mhjmylcel9-kt5FpJJuUKnzPI8BqVeG_eL6ktevA9odJRx56w9n2LivUaoQUCiXreLOLmSEwkkhIFnsyMcCCwkPpx4lMrkzjIr3B08_gRk5wIv4pV01OcSYR4QkXM7ZsNZZzRf-JtPHYn9SlT9DvwRVbYniYUCA7IM0OegFKSTt_-i7qvuKuYFHGDStfkijX2T6g_49oY1qfLsKldisccoOLfsaROpB1NiE9DBeM5OzAo-R98H_UiUFjsFVNvlunETbhuqX2yZFUjKxxerS_-1_DW8BmoD25Ofl188KM8gqUXo5lJs4bPTf41_N_V-57muZxdAq8kBazDKhaudAzskFNFF1B9dxwgxeE8wd5Gh_beCuCoP3K-9GwRVFfrdOCO19FDjqpLr0k94UfZzuflP5SDGXth2-AzZIslurPDL_1F4iadxq06GJggwryMxC7_Uc4_FQST53-gl9SXEFVSNdr6gsw318JNiyz8bxbBpIj7posqQeEaDg}"

export ENABLE_DEBUGGER="${ENABLE_DEBUGGER:-true}"
}


Expand All @@ -97,6 +97,7 @@ function setup_devlocal() {
export ES_ENABLE_AUTH="${ES_ENABLE_AUTH:-false}"

export DEV_ENV="DEVLOCAL"
export ENABLE_DEBUGGER="${ENABLE_DEBUGGER:-true}"
}

function setup_k8s_dev() {
Expand Down
Loading

0 comments on commit 7953db6

Please sign in to comment.