exp id and tensorboard visualization (LibCity#233)

* exp id and tensorboard * fix_bug * readme
Lucas-lyh · Dec 15, 2021 · 7b0a052 · 7b0a052
1 parent cac90a0
commit 7b0a052
Show file tree

Hide file tree

Showing 14 changed files with 91 additions and 33 deletions.
diff --git a/libcity/evaluator/road_representation_evaluator.py b/libcity/evaluator/road_representation_evaluator.py
@@ -13,11 +13,12 @@ def __init__(self, config):
         self._logger = getLogger()
         self.model = config.get('model', '')
         self.dataset = config.get('dataset', '')
+        self.exp_id = config.get('exp_id', None)
         self.data_path = './raw_data/' + self.dataset + '/'
         self.geo_file = config.get('geo_file', self.dataset)
         self.output_dim = config.get('output_dim', 32)
-        self.embedding_path = './libcity/cache/evaluate_cache/embedding_{}_{}_{}.npy'\
-            .format(self.model, self.dataset, self.output_dim)
+        self.embedding_path = './libcity/cache/{}/evaluate_cache/embedding_{}_{}_{}.npy'\
+            .format(self.exp_id, self.model, self.dataset, self.output_dim)
 
     def collect(self, batch):
         pass
@@ -53,8 +54,8 @@ def evaluate(self):
             if kind not in result_token:
                 result_token[kind] = []
             result_token[kind].append(self.ind_to_geo[i])
-        result_path = './libcity/cache/evaluate_cache/kmeans_category_{}_{}_{}.json'.\
-            format(self.model, str(self.output_dim), str(kinds))
+        result_path = './libcity/cache/{}/evaluate_cache/kmeans_category_{}_{}_{}.json'.\
+            format(self.exp_id, self.model, str(self.output_dim), str(kinds))
         json.dump(result_token, open(result_path, 'w'))
         self._logger.info('Kmeans category is saved at {}'.format(result_path))
 
@@ -78,8 +79,8 @@ def evaluate(self):
         df = pd.DataFrame(df)
         df.columns = ['id', 'rid', 'class', 'wkt']
         df = df.sort_values(by='class')
-        result_path = './libcity/cache/evaluate_cache/kmeans_qgis_{}_{}_{}.csv'.\
-            format(self.model, str(self.output_dim), str(kinds))
+        result_path = './libcity/cache/{}/evaluate_cache/kmeans_qgis_{}_{}_{}.csv'.\
+            format(self.exp_id, self.model, str(self.output_dim), str(kinds))
         df.to_csv(result_path, index=False)
         self._logger.info('Kmeans result for QGIS is saved at {}'.format(result_path))
 

diff --git a/libcity/executor/abstract_tradition_executor.py b/libcity/executor/abstract_tradition_executor.py
@@ -13,9 +13,10 @@ def __init__(self, config, model):
         self.config = config
         self.device = self.config.get('device', torch.device('cpu'))
         self.model = model
+        self.exp_id = self.config.get('exp_id', None)
 
-        self.cache_dir = './libcity/cache/model_cache'
-        self.evaluate_res_dir = './libcity/cache/evaluate_cache'
+        self.cache_dir = './libcity/cache/{}/model_cache'.format(self.exp_id)
+        self.evaluate_res_dir = './libcity/cache/{}/evaluate_cache'.format(self.exp_id)
 
         ensure_dir(self.cache_dir)
         ensure_dir(self.evaluate_res_dir)

diff --git a/libcity/executor/geosan_executor.py b/libcity/executor/geosan_executor.py
@@ -16,8 +16,9 @@ def __init__(self, config, model):
         self.device = self.config.get('device', torch.device('cpu'))
         self.model = model.to(self.device)
         self.evaluator = get_evaluator(config)
-        self.evaluate_res_dir = './libcity/cache/evaluate_cache'
-        self.cache_dir = './libcity/cache/model_cache'
+        self.exp_id = self.config.get('exp_id', None)
+        self.cache_dir = './libcity/cache/{}/model_cache'.format(self.exp_id)
+        self.evaluate_res_dir = './libcity/cache/{}/evaluate_cache'.format(self.exp_id)
         self.tmp_path = './libcity/tmp/checkpoint/'
 
     def train(self, train_dataloader, eval_dataloader):

diff --git a/libcity/executor/map_matching_executor.py b/libcity/executor/map_matching_executor.py
@@ -9,7 +9,9 @@ def __init__(self, config, model):
         self.model = model
         self.config = config
         self.evaluator = get_evaluator(config)
-        self.evaluate_res_dir = './libcity/cache/evaluate_cache'
+        self.exp_id = self.config.get('exp_id', None)
+        self.cache_dir = './libcity/cache/{}/model_cache'.format(self.exp_id)
+        self.evaluate_res_dir = './libcity/cache/{}/evaluate_cache'.format(self.exp_id)
         self._logger = getLogger()
 
     def evaluate(self, test_data):

diff --git a/libcity/executor/traffic_state_executor.py b/libcity/executor/traffic_state_executor.py
@@ -17,10 +17,11 @@ def __init__(self, config, model):
         self.config = config
         self.device = self.config.get('device', torch.device('cpu'))
         self.model = model.to(self.device)
+        self.exp_id = self.config.get('exp_id', None)
 
-        self.cache_dir = './libcity/cache/model_cache'
-        self.evaluate_res_dir = './libcity/cache/evaluate_cache'
-        self.summary_writer_dir = './libcity/log/runs'
+        self.cache_dir = './libcity/cache/{}/model_cache'.format(self.exp_id)
+        self.evaluate_res_dir = './libcity/cache/{}/evaluate_cache'.format(self.exp_id)
+        self.summary_writer_dir = './libcity/cache/{}/'.format(self.exp_id)
         ensure_dir(self.cache_dir)
         ensure_dir(self.evaluate_res_dir)
         ensure_dir(self.summary_writer_dir)

diff --git a/libcity/executor/traj_loc_pred_executor.py b/libcity/executor/traj_loc_pred_executor.py
@@ -18,8 +18,9 @@ def __init__(self, config, model):
         self.config = config
         self.model = model.to(self.config['device'])
         self.tmp_path = './libcity/tmp/checkpoint/'
-        self.cache_dir = './libcity/cache/model_cache'
-        self.evaluate_res_dir = './libcity/cache/evaluate_cache'
+        self.exp_id = self.config.get('exp_id', None)
+        self.cache_dir = './libcity/cache/{}/model_cache'.format(self.exp_id)
+        self.evaluate_res_dir = './libcity/cache/{}/evaluate_cache'.format(self.exp_id)
         self.loss_func = None  # TODO: 根据配置文件支持选择特定的 Loss Func 目前并未实装
         self._logger = getLogger()
         self.optimizer = self._build_optimizer()

diff --git a/libcity/model/road_representation/ChebConv.py b/libcity/model/road_representation/ChebConv.py
@@ -96,6 +96,7 @@ def __init__(self, config, data_feature):
         self.filter_type = config.get('filter_type', 'dual_random_walk')
         self.model = config.get('model', '')
         self.dataset = config.get('dataset', '')
+        self.exp_id = config.get('exp_id', None)
 
         self.encoder = ChebConvModule(num_nodes=self.num_nodes, max_diffusion_step=self.max_diffusion_step,
                                       adj_mx=self.adj_mx, device=self.device, input_dim=self.feature_dim,
@@ -117,8 +118,8 @@ def forward(self, batch):
         """
         inputs = batch['node_features']
         encoder_state = self.encoder(inputs)  # N, output_dim
-        np.save('./libcity/cache/evaluate_cache/embedding_{}_{}_{}.npy'
-                .format(self.model, self.dataset, self.output_dim),
+        np.save('./libcity/cache/{}/evaluate_cache/embedding_{}_{}_{}.npy'
+                .format(self.exp_id, self.model, self.dataset, self.output_dim),
                 encoder_state.detach().cpu().numpy())
         output = self.decoder(encoder_state)  # N, feature_dim
         return output

diff --git a/libcity/model/road_representation/GAT.py b/libcity/model/road_representation/GAT.py
@@ -37,6 +37,7 @@ def __init__(self, config, data_feature):
         self._scaler = self.data_feature.get('scaler')
 
         self.output_dim = config.get('output_dim', 32)
+        self.exp_id = config.get('exp_id', None)
         GATLayer = GATLayerImp3
         self.encoder = GATLayer(num_in_features=self.feature_dim, num_out_features=self.output_dim,
                                 num_of_heads=5, concat=False, device=self.device)
@@ -53,8 +54,8 @@ def forward(self, batch):
         """
         inputs = batch['node_features']
         encoder_state = self.encoder([inputs, self.Apt])[0]  # N, output_dim
-        np.save('./libcity/cache/evaluate_cache/embedding_{}_{}_{}.npy'
-                .format(self.model, self.dataset, self.output_dim),
+        np.save('./libcity/cache/{}/evaluate_cache/embedding_{}_{}_{}.npy'
+                .format(self.exp_id, self.model, self.dataset, self.output_dim),
                 encoder_state.detach().cpu().numpy())
         output = self.decoder([encoder_state, self.Apt])[0]  # N, feature_dim
         return output

diff --git a/libcity/model/road_representation/LINE.py b/libcity/model/road_representation/LINE.py
@@ -71,6 +71,7 @@ def __init__(self, config, data_feature):
 
         self.model = config.get('model', '')
         self.dataset = config.get('dataset', '')
+        self.exp_id = config.get('exp_id', None)
 
     def calculate_loss(self, batch):
         I, J, is_neg = batch['I'], batch['J'], batch['Neg']
@@ -88,7 +89,7 @@ def forward(self, I, J):
             elif order == 'second':
                 [u'_j^T * v_i for (i,j) in zip(I, J)]; (B,)
         """
-        np.save('./libcity/cache/evaluate_cache/embedding_{}_{}_{}.npy'
-                .format(self.model, self.dataset, self.output_dim),
+        np.save('./libcity/cache/{}/evaluate_cache/embedding_{}_{}_{}.npy'
+                .format(self.exp_id, self.model, self.dataset, self.output_dim),
                 self.embed.get_embeddings())
         return self.embed(I, J)
diff --git a/libcity/pipeline/pipeline.py b/libcity/pipeline/pipeline.py
@@ -7,7 +7,7 @@
 from ray.tune.suggest import ConcurrencyLimiter
 import json
 import torch
-
+import random
 from libcity.config import ConfigParser
 from libcity.data import get_dataset
 from libcity.utils import get_executor, get_model, get_logger, ensure_dir
@@ -29,18 +29,24 @@ def run_model(task=None, model_name=None, dataset_name=None, config_file=None,
     # load config
     config = ConfigParser(task, model_name, dataset_name,
                           config_file, saved_model, train, other_args)
+    exp_id = config.get('exp_id', None)
+    if exp_id is None:
+        # Make a new experiment ID
+        exp_id = int(random.SystemRandom().random() * 100000)
+        config['exp_id'] = exp_id
     # logger
     logger = get_logger(config)
-    logger.info('Begin pipeline, task={}, model_name={}, dataset_name={}'.
-                format(str(task), str(model_name), str(dataset_name)))
+    logger.info('Begin pipeline, task={}, model_name={}, dataset_name={}, exp_id={}'.
+                format(str(task), str(model_name), str(dataset_name), str(exp_id)))
+    logger.info(config.config)
     # 加载数据集
     dataset = get_dataset(config)
     # 转换数据，并划分数据集
     train_data, valid_data, test_data = dataset.get_data()
     data_feature = dataset.get_data_feature()
     # 加载执行器
-    model_cache_file = './libcity/cache/model_cache/{}_{}.m'.format(
-        model_name, dataset_name)
+    model_cache_file = './libcity/cache/{}/model_cache/{}_{}.m'.format(
+        exp_id, model_name, dataset_name)
     model = get_model(config, data_feature)
     executor = get_executor(config, model)
     # 训练

diff --git a/libcity/utils/utils.py b/libcity/utils/utils.py
@@ -116,8 +116,8 @@ def get_logger(config, name=None):
     log_dir = './libcity/log'
     if not os.path.exists(log_dir):
         os.makedirs(log_dir)
-    log_filename = '{}-{}-{}.log'.format(
-        config['model'], config['dataset'], get_local_time())
+    log_filename = '{}-{}-{}-{}.log'.format(config['exp_id'],
+                                            config['model'], config['dataset'], get_local_time())
     logfilepath = os.path.join(log_dir, log_filename)
 
     logger = logging.getLogger(name)

diff --git a/readme.md b/readme.md
@@ -54,7 +54,7 @@ More details about environment configuration is represented in [Docs](https://bi
 
 ## Quick-Start
 
-Before run models in LibCity, please make sure you download at least one dataset and put it in directory `./raw_data/`. The dataset link is [BaiduDisk with code 1231](https://pan.baidu.com/s/1qEfcXBO-QwZfiT0G3IYMpQ) or [Google Drive](https://drive.google.com/drive/folders/1g5v2Gq1tkOq8XO0HDCZ9nOTtRpB6-gPe?usp=sharing).
+Before running models in LibCity, please make sure you have downloaded at least one dataset and put it in the directory `./raw_data/`. The dataset link is [BaiduDisk with code 1231](https://pan.baidu.com/s/1qEfcXBO-QwZfiT0G3IYMpQ) or [Google Drive](https://drive.google.com/drive/folders/1g5v2Gq1tkOq8XO0HDCZ9nOTtRpB6-gPe?usp=sharing). All datasets used in LibCity need to be processed into the [atomic files](https://bigscity-libcity-docs.readthedocs.io/en/latest/user_guide/data/atomic_files.html) format.
 
 The script `run_model.py` is used for training and evaluating a single model in LibCity. When run the `run_model.py`, you must specify the following three parameters, namely **task**, **dataset** and **model**.  
 
@@ -68,6 +68,26 @@ This script will run the GRU model on the METR_LA dataset for traffic state pred
 
 More details is represented in [Docs](https://bigscity-libcity-docs.readthedocs.io/en/latest/get_started/quick_start.html).
 
+## TensorBoard Visualization
+
+During model training, LibCity records the loss of each epoch and supports TensorBoard visualization.
+
+After running the model once, you can use the following command to visualize:
+
+```shell
+tensorboard --logdir 'libcity/cache'
+```
+
+```
+TensorFlow installation not found - running with reduced feature set.
+Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
+TensorBoard 2.4.1 at http://localhost:6006/ (Press CTRL+C to quit)
+```
+
+Visit this address ([http://localhost:6006/](http://localhost:6006/)) in your browser to see the visualized results.
+
+![](https://bigscity-libcity-docs.readthedocs.io/en/latest/_images/tensorboard.png)
+
 ## Reproduced Model List
 
 For a list of all models reproduced in LibCity, see [Docs](https://bigscity-libcity-docs.readthedocs.io/en/latest/user_guide/model.html), where you can see the abbreviation of the model and the corresponding papers and citations.
@@ -76,7 +96,7 @@ For a list of all models reproduced in LibCity, see [Docs](https://bigscity-libc
 
 In order to facilitate users to use LibCity, we provide users with some tutorials:
 
-- We gave lectures on both ACM SIGSPATIAL 2021 Main Track and Local Track. For related lecture videos and Slides, please see our [HomePage](https://libcity.ai/#/tutorial) (Chinese and English).
+- We gave lectures on both ACM SIGSPATIAL 2021 Main Track and Local Track. For related lecture videos and Slides, please see our [HomePage](https://libcity.ai/#/tutorial) (in Chinese and English).
 - We provide entry-level tutorials (in Chinese and English) in the documentation.
   - [Install and quick start](https://bigscity-libcity-docs.readthedocs.io/en/latest/tutorial/install_quick_start.html)  & [安装和快速上手](https://bigscity-libcity-docs.readthedocs.io/zh_CN/latest/tutorial/install_quick_start.html)
   - [Run an existing model in LibCity](https://bigscity-libcity-docs.readthedocs.io/en/latest/tutorial/run_model.html) & [运行LibCity中已复现的模型](https://bigscity-libcity-docs.readthedocs.io/zh_CN/latest/tutorial/run_model.html)

diff --git a/readme_zh.md b/readme_zh.md
@@ -54,7 +54,7 @@ cd Bigscity-LibCity
 
 ## Quick-Start
 
-在 LibCity 中运行模型之前，请确保您至少下载了一个数据集并将其放在目录 `./raw_data/` 中。 数据集链接是 [BaiduDisk with code 1231](https://pan.baidu.com/s/1qEfcXBO-QwZfiT0G3IYMpQ) 或 [Google Drive](https://drive.google.com/drive/folders/1g5v2Gq1tkOq8XO0HDCZ9nOTtRpB6-gPe?usp=sharing) 。
+在 LibCity 中运行模型之前，请确保您至少下载了一个数据集并将其放在目录 `./raw_data/` 中。 数据集链接是 [BaiduDisk with code 1231](https://pan.baidu.com/s/1qEfcXBO-QwZfiT0G3IYMpQ) 或 [Google Drive](https://drive.google.com/drive/folders/1g5v2Gq1tkOq8XO0HDCZ9nOTtRpB6-gPe?usp=sharing) 。LibCity 中所用的数据集需要被处理成[原子文件](https://bigscity-libcity-docs.readthedocs.io/zh_CN/latest/user_guide/data/atomic_files.html)的格式。
 
 脚本 `run_model.py` 用于在 LibCity 中训练和评估单个模型。 运行`run_model.py`时，必须指定以下三个参数，即**task、dataset和model**。例如：
 
@@ -66,9 +66,29 @@ python run_model.py --task traffic_state_pred --model GRU --dataset METR_LA
 
 更多细节请访问 [文档](https://bigscity-libcity-docs.readthedocs.io/zh_CN/latest/get_started/quick_start.html) 。
 
+## TensorBoard Visualization
+
+在模型训练过程中，LibCity 会记录每个 epoch 的损失，并支持 tensorboard 可视化。
+
+模型运行一次后，可以使用以下命令进行可视化：
+
+```shell
+tensorboard --logdir 'libcity/cache'
+```
+
+```
+TensorFlow installation not found - running with reduced feature set.
+Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
+TensorBoard 2.4.1 at http://localhost:6006/ (Press CTRL+C to quit)
+```
+
+在浏览器中访问这个地址（[http://localhost:6006/](http://localhost:6006/)）可以看到可视化的结果。
+
+![](https://bigscity-libcity-docs.readthedocs.io/en/latest/_images/tensorboard.png)
+
 ## Reproduced Model List
 
-LibCity 中所复现的全部模型列表见[文档](https://bigscity-libcity-docs.readthedocs.io/en/latest/user_guide/model.html)，在这里你可以看到模型的简称和对应的论文及引用文献。
+LibCity 中所复现的全部模型列表见[文档](https://bigscity-libcity-docs.readthedocs.io/zh_CN/latest/user_guide/model.html)，在这里你可以看到模型的简称和对应的论文及引用文献。
 
 ## Tutorial
 

diff --git a/run_model.py b/run_model.py
@@ -36,6 +36,8 @@ def add_other_args(parser):
     parser.add_argument('--train', type=str2bool, default=True,
                         help='whether re-train model if the model is \
                              trained before')
+    parser.add_argument('--exp_id', type=str,
+                        default=None, help='id of experiment')
     # 增加其他可选的参数
     add_other_args(parser)
     # 解析参数