Skip to content

Commit

Permalink
Merge pull request #131 from dod-advana/feature/UOT-145475-finetune-w…
Browse files Browse the repository at this point in the history
…ith-new-data

Feature/uot 145475 finetune with new data
  • Loading branch information
rha930 committed Jun 23, 2022
2 parents 85701e5 + 76e63ff commit 7953db6
Show file tree
Hide file tree
Showing 14 changed files with 361 additions and 144 deletions.
1 change: 1 addition & 0 deletions dev.requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ matplotlib
seaborn
autopep8
black
debugpy
1 change: 1 addition & 0 deletions gamechangerml/api/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ services:
- capabilities: [gpu]
ports:
- "5000:5000"
- "5678:5678"
env_file:
- .env
volumes:
Expand Down
6 changes: 5 additions & 1 deletion gamechangerml/api/fastapi/mlapp.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
from fastapi import FastAPI
import faulthandler
from fastapi import FastAPI

from gamechangerml.api.fastapi.routers import startup, search, controls
from gamechangerml.debug.debug_connector import debug_if_flagged

# start debugger if flagged
debug_if_flagged()

# start API
app = FastAPI()
Expand Down
2 changes: 1 addition & 1 deletion gamechangerml/api/fastapi/routers/controls.py
Original file line number Diff line number Diff line change
Expand Up @@ -1030,7 +1030,7 @@ async def get_user_data(data_dict: dict, response: Response):
searchData = data_dict["params"]["searchData"]
df = pd.DataFrame(searchData)
GC_SEARCH_DATA = os.path.join(
DATA_PATH, "user_data", "search_history","SearchPdfMapping.csv"
DATA_PATH, "user_data", "search_history", "SearchPdfMapping.csv"
)
df.to_csv(GC_SEARCH_DATA)

Expand Down
26 changes: 18 additions & 8 deletions gamechangerml/api/utils/threaddriver.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,32 @@
import json
import sys
from gamechangerml.api.utils.logger import logger
from gamechangerml.api.utils import processmanager
from gamechangerml.debug.debug_connector import check_debug_flagged

# A class that takes in a function and a dictionary of arguments.
# The keys in args have to match the parameters in the function.


class MlThread(threading.Thread):
def __init__(self, function, args = {}):
def __init__(self, function, args={}):
super(MlThread, self).__init__()
self.function = function
self.args = args
self.killed = False

def run(self):
try:
sys.settrace(self.globaltrace)
if check_debug_flagged():
logger.info(
"Debugger from debugpy package is not compatible with sys.settrace, so globaltrace not activated for MlThread")
else:
sys.settrace(self.globaltrace)

self.function(**self.args)
except Exception as e:
logger.error(e)
logger.info("Thread errored out attempting " + self.function.__name__ + " with parameters: " + json.dumps(self.args))
logger.info("Thread errored out attempting " + self.function.__name__ +
" with parameters: " + json.dumps(self.args))

def globaltrace(self, frame, why, arg):
if why == 'call':
Expand All @@ -40,16 +48,18 @@ def kill(self):
# Pass in a function and args which is an array of dicts
# A way to load multiple jobs and run them on threads.
# join is set to false unless we need to collect the results immediately.


def run_threads(function_list, args_list=None, join=False):
    """Start one MlThread per function and optionally wait for completion.

    Args:
        function_list: Callables to run, one thread each.
        args_list: Optional list of kwargs dicts matched positionally to
            *function_list*; functions without a matching entry run with
            no arguments.
        join: When True, block until every started thread has finished.
    """
    # None-default instead of a mutable [] default, which would be shared
    # between calls to this function.
    if args_list is None:
        args_list = []
    threads = []
    for i, function in enumerate(function_list):
        args = args_list[i] if i < len(args_list) else {}
        thread = MlThread(function, args)
        threads.append(thread)
        thread.start()
    # If we join the threads the function will wait until they have all completed.
    if join:
        for thread in threads:
            thread.join()
32 changes: 32 additions & 0 deletions gamechangerml/debug/debug_connector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import os
from gamechangerml.api.fastapi.settings import logger

# Name of the environment variable that toggles the remote debugger.
env_flag = "ENABLE_DEBUGGER"


def check_debug_flagged():
    """Return True when the ENABLE_DEBUGGER environment variable is truthy.

    Accepts 'true' in any letter case and ignores surrounding whitespace,
    so values such as 'True' exported from shell configs also enable
    debugging. Absent or any other value means disabled.
    """
    flag_str = os.getenv(env_flag, "false")
    return flag_str.strip().lower() == 'true'


def debug_if_flagged():
    """Start a debugpy listener on port 5678 when the debug env flag is set.

    When the flag is absent or false this only logs a line and returns.
    Any failure while starting the debugger is logged (with short pauses so
    the messages are visible in scrolling container logs) but never raised,
    so the API can still boot without a debugger.
    """
    if not check_debug_flagged():
        logger.info("ENABLE_DEBUGGER not set, debugger not started")
        return

    try:
        import debugpy
        debugger_port = 5678
        debugpy.listen(('0.0.0.0', debugger_port))
        logger.info(f"\n Debugger listening on {debugger_port} 🥾🦟 \n")
        # debugpy.wait_for_client()
        # debugpy.breakpoint()
    except Exception as e:
        import time
        logger.warning("ERROR STARTING DEBUGGER CONNECTION")
        time.sleep(3)
        logger.warning(e)
        time.sleep(3)
        logger.info(
            f"Debugging can be turned off by removing env variable {env_flag}")
63 changes: 63 additions & 0 deletions gamechangerml/debug/vscode_debug.README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Debug setup for Visual Studio Code

If you already have a launch.json, merge the below config, otherwise:

go to `Run and Debug` tab on sidebar

click link text `create a launch.json file`

select: `Python`

select: `Remote Attach`

Enter the host name... : `localhost`

Enter the port number... : `5678`

Your `launch.json` should then look like the example below:
```
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: Remote Attach",
"type": "python",
"request": "attach",
"connect": {
"host": "localhost",
"port": 5678
},
"pathMappings": [
{
"localRoot": "${workspaceFolder}",
"remoteRoot": "."
}
],
"justMyCode": true
}
]
}
```

## How it's working

When you use

`docker-compose up` - `gamechanger-ml/gamechangerml/api/docker-compose.yml`

This exposes port `5678` for `gamechanger-ml-gpu`

<br>

In the entrypoint `gamechanger-ml/gamechangerml/api/fastapi/mlapp.py`

`debug_if_flagged()` is called immediately, which is from `gamechanger-ml/gamechangerml/debug/debug_connector.py`

This starts up `debugpy` to listen on `5678` if the ENV variable `ENABLE_DEBUGGER` in `setup_env.sh` is set to `true`

The vscode debugger will attach to it using `launch.json` config

Now you're ready to crush bugs 🥾🦟
91 changes: 48 additions & 43 deletions gamechangerml/scripts/update_eval_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from gamechangerml.src.model_testing.validation_data import IntelSearchData
from gamechangerml.configs.config import ValidationConfig
from gamechangerml.src.utilities.test_utils import (
make_timestamp_directory, check_directory, CustomJSONizer
)
make_timestamp_directory, check_directory, NumpyJSONEncoder
)
from gamechangerml import DATA_PATH
from gamechangerml.api.utils.pathselect import get_model_paths
import logging
Expand All @@ -17,27 +17,29 @@


def make_tiered_eval_data(index_path, testing_only):

if not index_path:
index_path = SENT_INDEX

if not os.path.exists(os.path.join(DATA_PATH, "validation", "domain", "sent_transformer")):
os.mkdir(os.path.join(DATA_PATH, "validation", "domain", "sent_transformer"))

sub_dir = os.path.join(DATA_PATH, "validation", "domain", "sent_transformer")

os.mkdir(os.path.join(DATA_PATH, "validation",
"domain", "sent_transformer"))

sub_dir = os.path.join(DATA_PATH, "validation",
"domain", "sent_transformer")

save_dir = make_timestamp_directory(sub_dir)

def save_data(
level: str,
min_correct_matches: int,
max_results: int,
start_date: str,
end_date: str,
exclude_searches: List[str],
filter_queries: bool,
testing_only: bool,
save_dir: Union[str,os.PathLike]=save_dir) -> Tuple[Dict[str,str], Dict[str,str], Dict[str,str]]:
level: str,
min_correct_matches: int,
max_results: int,
start_date: str,
end_date: str,
exclude_searches: List[str],
filter_queries: bool,
testing_only: bool,
save_dir: Union[str, os.PathLike] = save_dir) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str]]:
"""Makes eval data for each tier level using args from config.py and saves to save_dir
Args:
level [str]: tier level (['any', 'silver', 'gold'])
Expand All @@ -54,19 +56,19 @@ def save_data(
max_res = max_results[level]

intel = IntelSearchData(
start_date=start_date,
end_date=end_date,
exclude_searches=exclude_searches,
min_correct_matches=min_matches,
max_results=max_res,
filter_queries=filter_queries,
index_path=index_path,
testing_only=testing_only
)
start_date=start_date,
end_date=end_date,
exclude_searches=exclude_searches,
min_correct_matches=min_matches,
max_results=max_res,
filter_queries=filter_queries,
index_path=index_path,
testing_only=testing_only
)

save_intel = {
"queries": intel.queries,
"collection": intel.collection,
"queries": intel.queries,
"collection": intel.collection,
"meta_relations": intel.all_relations,
"correct": intel.correct,
"incorrect": intel.incorrect,
Expand All @@ -89,45 +91,48 @@ def save_data(
"filter_queries": str(filter_queries)
}

save_intel = json.dumps(save_intel, cls=CustomJSONizer)
save_intel = json.dumps(save_intel, cls=NumpyJSONEncoder)
intel_path = check_directory(os.path.join(save_dir, level))
intel_file = os.path.join(intel_path, 'intelligent_search_data.json')
metafile = os.path.join(intel_path, 'intelligent_search_metadata.json')
metafile = os.path.join(intel_path, 'intelligent_search_metadata.json')
with open(intel_file, "w") as outfile:
json.dump(save_intel, outfile)

with open(metafile, "w") as outfile:
json.dump(metadata, outfile)
logger.info(f"***Saved intelligent search validation data to: {intel_path}")
logger.info(
f"***Saved intelligent search validation data to: {intel_path}")

return metadata

all_data = save_data(
level='any',
filter_queries = False,
testing_only = testing_only,
filter_queries=False,
testing_only=testing_only,
**ValidationConfig.TRAINING_ARGS
)
)

silver_data = save_data(
level='silver',
filter_queries = False,
filter_queries=False,
testing_only=testing_only,
**ValidationConfig.TRAINING_ARGS
)
)

gold_data = save_data(
level='gold',
filter_queries = False, # should use same (updated) exclude list of queries as silver_data
# should use same (updated) exclude list of queries as silver_data
filter_queries=False,
testing_only=testing_only,
**ValidationConfig.TRAINING_ARGS
)
)

return all_data, silver_data, gold_data


if __name__ == '__main__':
    # Build the tiered eval data with defaults; log errors instead of crashing.
    try:
        make_tiered_eval_data(index_path=None, testing_only=False)
    except Exception as e:
        logger.warning(e, exc_info=True)
3 changes: 2 additions & 1 deletion gamechangerml/setup_env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ function setup_dev() {
export GC_ENABLE_SSL="${GC_ENABLE_SSL:-false}"

export ML_WEB_TOKEN="${ML_WEB_TOKEN:-eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJwZXJtcyI6WyJHYW1lY2hhbmdlciBBZG1pbiJdLCJjbiI6IjAwNyIsImNzcmYtdG9rZW4iOiI0ZWE1YzUwOGE2NTY2ZTc2MjQwNTQzZjhmZWIwNmZkNDU3Nzc3YmUzOTU0OWM0MDE2NDM2YWZkYTY1ZDIzMzBlIiwiaWF0IjoxNjQ0MzQyMjI0fQ.ezxT-36a25IMFJPea3re7sUwYZfm8ivZvDVAv_zti1W6DRkM2Hs7WwFuy-gR092m-p7Z7ohb7yM2AKVbJAn4CMtI3_1j0_VzzYb6yhQj7YIPg-Cax5Em9JCrIlCEx7r26o-zV1me0mIYMATJTMInKikeBvg2RJErvLwkgZNQVT8gQyR-JxM4mhjmylcel9-kt5FpJJuUKnzPI8BqVeG_eL6ktevA9odJRx56w9n2LivUaoQUCiXreLOLmSEwkkhIFnsyMcCCwkPpx4lMrkzjIr3B08_gRk5wIv4pV01OcSYR4QkXM7ZsNZZzRf-JtPHYn9SlT9DvwRVbYniYUCA7IM0OegFKSTt_-i7qvuKuYFHGDStfkijX2T6g_49oY1qfLsKldisccoOLfsaROpB1NiE9DBeM5OzAo-R98H_UiUFjsFVNvlunETbhuqX2yZFUjKxxerS_-1_DW8BmoD25Ofl188KM8gqUXo5lJs4bPTf41_N_V-57muZxdAq8kBazDKhaudAzskFNFF1B9dxwgxeE8wd5Gh_beCuCoP3K-9GwRVFfrdOCO19FDjqpLr0k94UfZzuflP5SDGXth2-AzZIslurPDL_1F4iadxq06GJggwryMxC7_Uc4_FQST53-gl9SXEFVSNdr6gsw318JNiyz8bxbBpIj7posqQeEaDg}"

export ENABLE_DEBUGGER="${ENABLE_DEBUGGER:-true}"
}


Expand All @@ -97,6 +97,7 @@ function setup_devlocal() {
export ES_ENABLE_AUTH="${ES_ENABLE_AUTH:-false}"

export DEV_ENV="DEVLOCAL"
export ENABLE_DEBUGGER="${ENABLE_DEBUGGER:-true}"
}

function setup_k8s_dev() {
Expand Down
Loading

0 comments on commit 7953db6

Please sign in to comment.