Skip to content

Commit

Permalink
fix embedding being a complex value
Browse files Browse the repository at this point in the history
  • Loading branch information
vinaysb committed May 14, 2024
1 parent 050c3e7 commit aee06fa
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 7 deletions.
14 changes: 12 additions & 2 deletions src/clep/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,14 +416,23 @@ def generate_network(
default=0.1,
show_default=True,
)
@click.option(
'--raw_embedding',
help='Flag to indicate if the embedding should be returned as is (default only returns the real part of a complex embedding)',
is_flag=True,
show_default=True,
default=False,
required=False,
)
def kge(
data: str,
design: str,
out: str,
model_config: str,
all_nodes: bool = False,
train_size: float = 0.8,
validation_size: float = 0.1
validation_size: float = 0.1,
raw_embedding: bool = False
) -> None:
"""Perform knowledge graph embedding."""
with open(model_config, 'r') as config_file:
Expand All @@ -444,7 +453,8 @@ def kge(
return_patients=(not all_nodes),
model_config=config,
train_size=train_size,
validation_size=validation_size
validation_size=validation_size,
complex_embedding=raw_embedding
)

embedding_df.to_csv(f'{out}/embedding.tsv', sep='\t')
Expand Down
19 changes: 14 additions & 5 deletions src/clep/embedding/kge.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ def do_kge(
model_config: Dict[str, Any],
return_patients: bool = True,
train_size: float = 0.8,
validation_size: float = 0.1
validation_size: float = 0.1,
complex_embedding: bool = False
) -> pd.DataFrame:
"""Carry out KGE on the given data.
Expand All @@ -31,13 +32,15 @@ def do_kge(
:param return_patients: Flag to indicate if the final data should contain only patients or even the features
:param train_size: Size of the training data for KGE ranging from 0 - 1
:param validation_size: Size of the validation data for KGE ranging from 0 - 1. It must be lower than training size
:param complex_embedding: Flag to indicate if the embedding should be returned as is. If False (default), only the real part of the embedding is returned.
:return: Dataframe containing the embedding from the KGE
"""
design_norm_df = design.astype(str, copy=True)

unique_nodes = edgelist[~edgelist['label'].isna()].drop_duplicates('source')

label_mapping = {patient: label for patient, label in zip(unique_nodes['source'], unique_nodes['label'])}
# Create a mapping of the patient to the label. The patient id is converted to string to avoid duplicates
label_mapping = {str(patient): label for patient, label in zip(unique_nodes['source'], unique_nodes['label'])}

edgelist = edgelist.drop(columns='label')

Expand Down Expand Up @@ -69,7 +72,7 @@ def do_kge(
best_model, triple_factory = pipeline_results.model, pipeline_results.training

# Get the embedding as a numpy array. Ignore the type as the model will be of type ERModel (Embedding model)
embedding_values = _model_to_numpy(best_model) # type: ignore
embedding_values = _model_to_numpy(best_model, complex=complex_embedding) # type: ignore

# Create columns as component names
embedding_columns = [f'Component_{i}' for i in range(1, embedding_values.shape[1] + 1)]
Expand Down Expand Up @@ -131,11 +134,17 @@ def _weighted_splitter(


def _model_to_numpy(
    model: ERModel[HeadRepresentation, RelationRepresentation, TailRepresentation],
    complex: bool = False
) -> npt.NDArray[np.float64 | np.float32]:
    """Retrieve the entity embedding from the model as a numpy array.

    :param model: Model whose first entity representation is exported
    :param complex: Flag to indicate if the embedding should be returned as is. If False (default), only the real
        part of the embedding is returned. The name shadows the ``complex`` builtin inside this function; it is kept
        because callers pass it by keyword.
    :return: Numpy array holding the embedding taken from the model's first entity representation
    """
    # Detach from the autograd graph and move to the CPU before converting to numpy.
    # NOTE(review): indices=None presumably selects every entity - confirm against the pykeen documentation.
    embedding_numpy: npt.NDArray[np.float64 | np.float32] = (
        model.entity_representations[0](indices=None).detach().cpu().numpy()
    )

    if complex:
        # NOTE(review): if the model produces a complex-valued embedding, the array returned here will not match
        # the float dtype promised by the return annotation - confirm that callers can handle it.
        return embedding_numpy

    # Get the real part of the embedding for classification tasks
    return embedding_numpy.real


def run_optimization(dataset: Tuple[str, str, str], model_config: Dict[str, Any], out_dir: str) -> None:
Expand Down

0 comments on commit aee06fa

Please sign in to comment.