Relphormer baseline

commit c0d0be076f

.gitignore (vendored, new file, 9 lines)
@@ -0,0 +1,9 @@
output/
pretrain/output/
logs/
pretrain/logs/
dataset/FB15k-237/examples*.txt
dataset/FB15k-237/masked_*.txt
dataset/FB15k-237/cached_*.pkl
**/__pycache__/
**/.DS_Store
LICENSE (new file, 21 lines)
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2021 ZJUNLP

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
QA/__init__.py (new file, 6 lines)
@@ -0,0 +1,6 @@


from transformers import BartForConditionalGeneration, T5ForConditionalGeneration, GPT2LMHeadModel

from .model import *
QA/fbqa/relphormer-filtered/dev-stat.json (new file, 6 lines)
@@ -0,0 +1,6 @@
{
    "#examples": 3994,
    "#kept_examples": 3994,
    "#mappable_examples": 743,
    "#multiple_answer_examples": 2
}

QA/fbqa/relphormer-filtered/dev.json (new file, 8918 lines; diff suppressed because it is too large)

QA/fbqa/relphormer-filtered/ids.txt (new file, 4665 lines; diff suppressed because it is too large)

QA/fbqa/relphormer-filtered/test-stat.json (new file, 6 lines)
@@ -0,0 +1,6 @@
{
    "#examples": 3996,
    "#kept_examples": 3996,
    "#mappable_examples": 755,
    "#multiple_answer_examples": 0
}

QA/fbqa/relphormer-filtered/test.json (new file, 9062 lines; diff suppressed because it is too large)

QA/fbqa/relphormer-filtered/train-stat.json (new file, 6 lines)
@@ -0,0 +1,6 @@
{
    "#examples": 20358,
    "#kept_examples": 20358,
    "#mappable_examples": 3713,
    "#multiple_answer_examples": 4
}

QA/fbqa/relphormer-filtered/train.json (new file, 44558 lines; diff suppressed because it is too large)

QA/fbqa/relphormer/dev-stat.json (new file, 6 lines)
@@ -0,0 +1,6 @@
{
    "#examples": 3994,
    "#kept_examples": 3994,
    "#mappable_examples": 743,
    "#multiple_answer_examples": 2
}

QA/fbqa/relphormer/dev.json (new file, 51956 lines; diff suppressed because it is too large)

QA/fbqa/relphormer/ids.txt (new file, 4665 lines; diff suppressed because it is too large)

QA/fbqa/relphormer/test-stat.json (new file, 6 lines)
@@ -0,0 +1,6 @@
{
    "#examples": 3996,
    "#kept_examples": 3996,
    "#mappable_examples": 755,
    "#multiple_answer_examples": 0
}

QA/fbqa/relphormer/test.json (new file, 51931 lines; diff suppressed because it is too large)

QA/fbqa/relphormer/train-stat.json (new file, 6 lines)
@@ -0,0 +1,6 @@
{
    "#examples": 20358,
    "#kept_examples": 20358,
    "#mappable_examples": 3713,
    "#multiple_answer_examples": 4
}

QA/fbqa/relphormer/train.json (new file, 265200 lines; diff suppressed because it is too large)
QA/hitter-bert.py (new file, 403 lines)
@@ -0,0 +1,403 @@
|
||||
import json
|
||||
import math
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
from transformers import BertTokenizer, BertForMaskedLM, AdamW, get_linear_schedule_with_warmup, AutoConfig
|
||||
|
||||
import torch
|
||||
from torch import device, nn
|
||||
from torch.utils.data import DataLoader, Dataset
|
||||
|
||||
import pytorch_lightning as pl
|
||||
from pytorch_lightning.loggers import TensorBoardLogger
|
||||
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
|
||||
from pytorch_lightning.utilities.seed import seed_everything
|
||||
from transformers.tokenization_bert import BertTokenizerFast
|
||||
|
||||
from kge.model import KgeModel
|
||||
from kge.util.io import load_checkpoint
|
||||
from kge.util import sc
|
||||
# from relphormer.lit_models import TransformerLitModel
|
||||
from relphormer.models import BertKGC
|
||||
# from relphormer.data import KGC
|
||||
import os
|
||||
|
||||
os.environ['CUDA_VISIBLE_DEVICES']='4'
|
||||
|
||||
MODEL = 'bert-base-uncased'
|
||||
tokenizer = BertTokenizer.from_pretrained(MODEL)
|
||||
|
||||
|
||||
class FBQADataset(Dataset):
|
||||
|
||||
def __init__(self, file_dir):
|
||||
self.examples = json.load(Path(file_dir).open("rb"))
|
||||
|
||||
def __len__(self):
|
||||
return len(self.examples)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
if torch.is_tensor(idx):
|
||||
idx = idx.tolist()
|
||||
return self.examples[idx]
|
||||
|
||||
|
||||
def fbqa_collate(samples):
|
||||
questions = []
|
||||
answers = []
|
||||
answer_ids = []
|
||||
entities = []
|
||||
entity_names = []
|
||||
relations = []
|
||||
for item in samples:
|
||||
q = item["RawQuestion"] + "[MASK]" * len(item["AnswerEntity"]) + "."
|
||||
questions.append(q)
|
||||
answers.append(item["AnswerEntity"])
|
||||
answer_ids.append(item["AnswerEntityID"])
|
||||
entities.append(item["TopicEntityID"])
|
||||
entity_names.append(item["TopicEntityName"])
|
||||
relations.append(item["RelationID"])
|
||||
|
||||
questions = tokenizer(questions, return_tensors='pt', padding=True)
|
||||
entity_names = tokenizer(entity_names, add_special_tokens=False)
|
||||
answers, answers_lengths = sc.pad_seq_of_seq(answers)
|
||||
answers = torch.LongTensor(answers)
|
||||
answers_lengths = torch.LongTensor(answers_lengths)
|
||||
answer_ids = torch.LongTensor(answer_ids)
|
||||
|
||||
input_ids = questions['input_ids']
|
||||
masked_labels = torch.ones_like(input_ids) * -100
|
||||
masked_labels[input_ids == tokenizer.mask_token_id] = answers[answers != 0]
|
||||
entity_mask = torch.zeros_like(input_ids).bool()
|
||||
entity_span_index = input_ids.new_zeros((len(input_ids), 2))
|
||||
for i, e_tokens in enumerate(entity_names['input_ids']):
|
||||
q_tokens = input_ids[i].tolist()
|
||||
for s_index in range(len(q_tokens) - len(e_tokens)):
|
||||
if all([e_token == q_tokens[s_index + j] for j, e_token in enumerate(e_tokens)]):
|
||||
entity_mask[i][s_index:s_index + len(e_tokens)] = True
|
||||
entity_span_index[i][0] = s_index
|
||||
entity_span_index[i][1] = s_index + len(e_tokens) - 1
|
||||
break
|
||||
|
||||
entities = torch.LongTensor(entities)
|
||||
relations = torch.LongTensor(relations)
|
||||
|
||||
return questions.data, masked_labels, answers, answers_lengths, answer_ids, entities, relations, entity_mask, entity_span_index
|
||||
|
||||
|
||||
class SelfOutput(nn.Module):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
||||
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
|
||||
def forward(self, hidden_states, input_tensor):
|
||||
hidden_states = self.dense(hidden_states)
|
||||
hidden_states = self.dropout(hidden_states)
|
||||
hidden_states = self.LayerNorm(hidden_states + input_tensor)
|
||||
return hidden_states
|
||||
|
||||
|
||||
class CrossAttention(nn.Module):
|
||||
def __init__(self, config, ctx_hidden_size):
|
||||
super().__init__()
|
||||
self.self = CrossAttentionInternal(config, ctx_hidden_size)
|
||||
self.output = SelfOutput(config)
|
||||
self.config = config
|
||||
self.apply(self._init_weights)
|
||||
|
||||
def _init_weights(self, module):
|
||||
""" Initialize the weights """
|
||||
if isinstance(module, (nn.Linear, nn.Embedding)):
|
||||
# Slightly different from the TF version which uses truncated_normal for initialization
|
||||
# cf https://github.com/pytorch/pytorch/pull/5617
|
||||
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
|
||||
elif isinstance(module, nn.LayerNorm):
|
||||
module.bias.data.zero_()
|
||||
module.weight.data.fill_(1.0)
|
||||
if isinstance(module, nn.Linear) and module.bias is not None:
|
||||
module.bias.data.zero_()
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states,
|
||||
attention_mask=None,
|
||||
head_mask=None,
|
||||
encoder_hidden_states=None,
|
||||
encoder_attention_mask=None,
|
||||
output_attentions=False,
|
||||
):
|
||||
self_outputs = self.self(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
output_attentions,
|
||||
)
|
||||
attention_output = self.output(self_outputs[0], hidden_states)
|
||||
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
|
||||
return outputs
|
||||
|
||||
|
||||
class CrossAttentionInternal(nn.Module):
|
||||
def __init__(self, config, ctx_hidden_size):
|
||||
super().__init__()
|
||||
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
|
||||
raise ValueError(
|
||||
"The hidden size (%d) is not a multiple of the number of attention "
|
||||
"heads (%d)" % (config.hidden_size, config.num_attention_heads)
|
||||
)
|
||||
|
||||
self.num_attention_heads = config.num_attention_heads
|
||||
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
|
||||
self.all_head_size = self.num_attention_heads * self.attention_head_size
|
||||
|
||||
self.query = nn.Linear(config.hidden_size, self.all_head_size)
|
||||
self.key = nn.Linear(ctx_hidden_size, self.all_head_size)
|
||||
self.value = nn.Linear(ctx_hidden_size, self.all_head_size)
|
||||
|
||||
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
|
||||
|
||||
def transpose_for_scores(self, x):
|
||||
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
|
||||
x = x.view(*new_x_shape)
|
||||
return x.permute(0, 2, 1, 3)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states,
|
||||
attention_mask=None,
|
||||
head_mask=None,
|
||||
encoder_hidden_states=None,
|
||||
encoder_attention_mask=None,
|
||||
output_attentions=False,
|
||||
):
|
||||
mixed_query_layer = self.query(hidden_states)
|
||||
|
||||
# If this is instantiated as a cross-attention module, the keys
|
||||
# and values come from an encoder; the attention mask needs to be
|
||||
# such that the encoder's padding tokens are not attended to.
|
||||
mixed_key_layer = self.key(encoder_hidden_states)
|
||||
mixed_value_layer = self.value(encoder_hidden_states)
|
||||
attention_mask = encoder_attention_mask
|
||||
|
||||
query_layer = self.transpose_for_scores(mixed_query_layer)
|
||||
key_layer = self.transpose_for_scores(mixed_key_layer)
|
||||
value_layer = self.transpose_for_scores(mixed_value_layer)
|
||||
|
||||
# Take the dot product between "query" and "key" to get the raw attention scores.
|
||||
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
|
||||
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
|
||||
if attention_mask is not None:
|
||||
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
|
||||
attention_scores = attention_scores + attention_mask
|
||||
|
||||
# Normalize the attention scores to probabilities.
|
||||
attention_probs = nn.Softmax(dim=-1)(attention_scores)
|
||||
|
||||
# This is actually dropping out entire tokens to attend to, which might
|
||||
# seem a bit unusual, but is taken from the original Transformer paper.
|
||||
attention_probs = self.dropout(attention_probs)
|
||||
|
||||
# Mask heads if we want to
|
||||
if head_mask is not None:
|
||||
attention_probs = attention_probs * head_mask
|
||||
|
||||
context_layer = torch.matmul(attention_probs, value_layer)
|
||||
|
||||
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
|
||||
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
|
||||
context_layer = context_layer.view(*new_context_layer_shape)
|
||||
|
||||
outputs = (context_layer, nn.Softmax(dim=-1)(attention_scores)) if output_attentions else (context_layer,)
|
||||
return outputs
|
||||
|
||||
|
||||
class CrossTrmFinetuner(pl.LightningModule):
|
||||
def __init__(self, hparams, bertmodel):
|
||||
super().__init__()
|
||||
self._hparams = hparams
|
||||
|
||||
self.lr = hparams['lr']
|
||||
self.weight_decay = hparams['weight_decay']
|
||||
|
||||
self.kg_dim = 320
|
||||
# self.bert = BertForMaskedLM.from_pretrained(MODEL)
|
||||
self.bert = bertmodel
|
||||
|
||||
if self._hparams['use_hitter']:
|
||||
self.kg_layer_num = 10
|
||||
self.cross_attentions = nn.ModuleList([CrossAttention(self.bert.config, self.kg_dim)
|
||||
for _ in range(self.kg_layer_num)])
|
||||
checkpoint = load_checkpoint('local/best/20200812-174221-trmeh-fb15k237-best/checkpoint_best.pt')
|
||||
self.hitter = KgeModel.create_from(checkpoint)
|
||||
|
||||
def forward(self, batch):
|
||||
sent_input, masked_labels, batch_labels, label_lens, answer_ids, s, p, entity_mask, entity_span_index = batch
|
||||
|
||||
if self._hparams['use_hitter']:
|
||||
# kg_masks: [bs, 1, 1, length]
|
||||
# kg_embeds: nlayer*[bs, length, dim]
|
||||
kg_embeds, kg_masks = self.hitter('get_hitter_repr', s, p)
|
||||
kg_attentions = [None] * 2 + [(self.cross_attentions[i], kg_embeds[(i + 2) // 2], kg_masks)
|
||||
for i in range(self.kg_layer_num)]
|
||||
else:
|
||||
kg_attentions = []
|
||||
|
||||
out = self.bert(kg_attentions=kg_attentions,
|
||||
output_attentions=True,
|
||||
output_hidden_states=True,
|
||||
return_dict=True,
|
||||
labels=masked_labels,
|
||||
**sent_input,
|
||||
)
|
||||
|
||||
return out
|
||||
|
||||
def training_step(self, batch, batch_idx):
|
||||
output = self(batch)
|
||||
loss = output.loss
|
||||
self.log('train_loss', loss, on_epoch=True, prog_bar=True)
|
||||
return {'loss': loss}
|
||||
|
||||
def validation_step(self, batch, batch_idx):
|
||||
batch_inputs, masked_labels, batch_labels, label_lens, answer_ids, s, p, entity_mask, _ = batch
|
||||
output = self(batch)
|
||||
input_tokens = batch_inputs["input_ids"].clone()
|
||||
|
||||
logits = output.logits[masked_labels != -100]
|
||||
probs = logits.softmax(dim=-1)
|
||||
values, predictions = probs.topk(1)
|
||||
hits = []
|
||||
now_pos = 0
|
||||
for sample_i, label_length in enumerate(label_lens.tolist()):
|
||||
failed = False
|
||||
for i in range(label_length):
|
||||
if (predictions[now_pos + i] == batch_labels[sample_i][i]).sum() != 1:
|
||||
failed = True
|
||||
break
|
||||
hits += [1] if not failed else [0]
|
||||
now_pos += label_length
|
||||
hits = torch.tensor(hits)
|
||||
input_tokens[input_tokens == tokenizer.mask_token_id] = predictions.flatten()
|
||||
pred_strings = [str(hits[i].item()) + ' ' + tokenizer.decode(input_tokens[i], skip_special_tokens=True)
|
||||
for i in range(input_tokens.size(0))]
|
||||
|
||||
return {'val_loss': output.loss,
|
||||
'val_acc': hits.float(),
|
||||
'pred_strings': pred_strings}
|
||||
|
||||
def validation_epoch_end(self, outputs):
|
||||
avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
|
||||
avg_val_acc = torch.cat([x['val_acc'] for x in outputs]).mean().to(avg_loss.device)
|
||||
|
||||
if self.global_rank == 0:
|
||||
tensorboard = self.logger.experiment
|
||||
tensorboard.add_text('pred', '\n\n'.join(sum([x['pred_strings'] for x in outputs], [])), self.global_step)
|
||||
|
||||
self.log('avg_loss', avg_loss, on_epoch=True, prog_bar=True, sync_dist=True)
|
||||
self.log('avg_val_acc', avg_val_acc, on_epoch=True, prog_bar=True, sync_dist=True)
|
||||
return {'val_loss': avg_loss}
|
||||
|
||||
def train_dataloader(self):
|
||||
return DataLoader(FBQADataset(self._hparams['train_dataset']),
|
||||
self._hparams['batch_size'],
|
||||
shuffle=True,
|
||||
collate_fn=fbqa_collate,
|
||||
num_workers=0)
|
||||
|
||||
def val_dataloader(self):
|
||||
return DataLoader(FBQADataset(self._hparams['val_dataset']),
|
||||
1,
|
||||
shuffle=False,
|
||||
collate_fn=fbqa_collate,
|
||||
num_workers=0)
|
||||
|
||||
def test_dataloader(self):
|
||||
return DataLoader(FBQADataset(self._hparams['test_dataset']),
|
||||
1,
|
||||
shuffle=False,
|
||||
collate_fn=fbqa_collate,
|
||||
num_workers=0)
|
||||
|
||||
def configure_optimizers(self):
|
||||
no_decay = ['bias', 'LayerNorm.weight']
|
||||
no_fine_tune = ['cross_attentions']
|
||||
pgs = [{'params': [p for n, p in self.named_parameters() if not any(nd in n for nd in no_decay) and not any([i in n for i in no_fine_tune])],
|
||||
'weight_decay': 0.01},
|
||||
{'params': [p for n, p in self.named_parameters() if any(nd in n for nd in no_decay) and not any([i in n for i in no_fine_tune])],
|
||||
'weight_decay': 0.0}]
|
||||
if self._hparams['use_hitter']:
|
||||
pgs.append({'params': self.cross_attentions.parameters(), 'lr': 5e-5, 'weight_decay': 0.01})
|
||||
# bert_optimizer = AdamW(pgs, lr=3e-5, weight_decay=1e-2)
|
||||
bert_optimizer = AdamW(pgs, lr=self.lr, weight_decay=self.weight_decay)
|
||||
bert_scheduler = {
|
||||
'scheduler': get_linear_schedule_with_warmup(bert_optimizer, self._hparams['max_steps'] // 10, self._hparams['max_steps']),
|
||||
'interval': 'step',
|
||||
'monitor': None
|
||||
}
|
||||
return [bert_optimizer], [bert_scheduler]
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--exp_name", default='default', nargs='?', help="Name of the experiment")
|
||||
parser.add_argument('--dataset', choices=['fbqa', 'webqsp'], default='fbqa', help="fbqa or webqsp")
|
||||
parser.add_argument('--filtered', default=False, action='store_true', help="Filtered or not")
|
||||
parser.add_argument('--hitter', default=False, action='store_true', help="Use pretrained HittER or not")
|
||||
parser.add_argument('--relphormer', default=False, action='store_true', help="Use pretrained relphormer or not")
|
||||
parser.add_argument('--seed', default=333, type=int, help='Seed number')
|
||||
parser.add_argument('--lr', default=3e-5, type=float, help='learning rate')
|
||||
parser.add_argument('--weight_decay', default=1e-2, type=float, help='weight decay')
|
||||
args = parser.parse_args()
|
||||
seed_everything(args.seed)
|
||||
|
||||
QA_DATASET = args.dataset
|
||||
if args.filtered and args.relphormer:
|
||||
SUBSET = 'relphormer-filtered'
|
||||
elif not args.filtered and args.relphormer:
|
||||
SUBSET = 'relphormer'
|
||||
elif args.filtered and not args.relphormer:
|
||||
SUBSET = 'fb15k237-filtered'
|
||||
else:
|
||||
SUBSET = 'fb15k237'
|
||||
|
||||
hparams = {
|
||||
'use_hitter': args.hitter,
|
||||
'relphormer': args.relphormer,
|
||||
'lr': args.lr,
|
||||
'weight_decay': args.weight_decay,
|
||||
'batch_size': 16,
|
||||
'max_epochs': 20,
|
||||
'train_dataset': f'data/{QA_DATASET}/{SUBSET}/train.json',
|
||||
'val_dataset': f'data/{QA_DATASET}/{SUBSET}/test.json',
|
||||
'test_dataset': f'data/{QA_DATASET}/{SUBSET}/test.json',
|
||||
}
|
||||
|
||||
if hparams['relphormer']:
|
||||
MODEL = "./local/relphormer/"
|
||||
config = AutoConfig.from_pretrained(MODEL)
|
||||
bertmodel = BertForMaskedLM.from_pretrained(MODEL, config=config)
|
||||
model = CrossTrmFinetuner(hparams, bertmodel=bertmodel)
|
||||
else:
|
||||
bertmodel = BertForMaskedLM.from_pretrained(MODEL)
|
||||
model = CrossTrmFinetuner(hparams, bertmodel=bertmodel)
|
||||
model.hparams['max_steps'] = (len(model.train_dataloader().dataset) // hparams['batch_size'] + 1) * hparams['max_epochs']
|
||||
base_path = '/tmp/hitbert-paper'
|
||||
logger = TensorBoardLogger(base_path, args.exp_name)
|
||||
checkpoint_callback = ModelCheckpoint(
|
||||
monitor='avg_val_acc',
|
||||
dirpath=base_path + '/' + args.exp_name,
|
||||
filename='{epoch:02d}-{avg_val_acc:.3f}',
|
||||
save_top_k=1,
|
||||
mode='max')
|
||||
trainer = pl.Trainer(gpus=1, accelerator="ddp",
|
||||
max_epochs=hparams['max_epochs'], max_steps=model.hparams['max_steps'],
|
||||
checkpoint_callback=True,
|
||||
gradient_clip_val=1.0, logger=logger,
|
||||
callbacks=[LearningRateMonitor(), checkpoint_callback])
|
||||
trainer.fit(model)
|
||||
print("QA Task End!")
|
QA/model.py (new file, 8 lines)
@@ -0,0 +1,8 @@
# from transformers.models.bert.modeling_bert import BertForMaskedLM
from models.huggingface_relformer import BertForMaskedLM

class BertKGC(BertForMaskedLM):

    @staticmethod
    def add_to_argparse(parser):
        parser.add_argument("--pretrain", type=int, default=0, help="")
        return parser
QA/scripts/relphormer_fbqa.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
for SEED in 111 222 333 444 555 666 777 888 999
do
    # echo ${LR} ${WD}
    python hitter-bert.py --dataset fbqa \
        --relphormer \
        --seed ${SEED} \
        --exp_name relphormer-fbqa \
        --lr 3e-5 \
        --weight_decay 1e-2
done
QA/scripts/relphormer_fbqa_filtered.sh (new file, 13 lines)
@@ -0,0 +1,13 @@

for SEED in 111 222 333 444 555 666 777 888 999
do

    # echo ${LR} ${WD}
    python hitter-bert.py --dataset fbqa \
        --relphormer \
        --filtered \
        --seed ${SEED} \
        --exp_name relphormer-filtered-fbqa \
        --lr 3e-5 \
        --weight_decay 1e-2
done
QA/scripts/relphormer_webqsp.sh (new file, 10 lines)
@@ -0,0 +1,10 @@

for SEED in 222 333 444 555 666 777 888 999
do
    python hitter-bert.py --dataset webqsp \
        --relphormer \
        --seed ${SEED} \
        --exp_name relphormer-webqsp \
        --lr 3e-5 \
        --weight_decay 1e-2
done
QA/scripts/relphormer_webqsp_filtered.sh (new file, 12 lines)
@@ -0,0 +1,12 @@

for SEED in 111 222 333 444 555 666 777 888 999
do
    # echo ${LR} ${WD}
    python hitter-bert.py --dataset webqsp \
        --relphormer \
        --filtered \
        --seed ${SEED} \
        --exp_name relphormer-filtered-webqsp \
        --lr 3e-5 \
        --weight_decay 1e-2
done
QA/utils.py (new file, 1159 lines; diff suppressed because it is too large)

QA/webqsp/relphormer-filtered/ids.txt (new file, 1647 lines; diff suppressed because it is too large)

QA/webqsp/relphormer-filtered/test-stat.json (new file, 6 lines)
@@ -0,0 +1,6 @@
{
    "#examples": 1639,
    "#kept_examples": 484,
    "#mappable_examples": 484,
    "#multiple_answer_examples": 800
}

QA/webqsp/relphormer-filtered/test.json (new file, 5810 lines; diff suppressed because it is too large)

QA/webqsp/relphormer-filtered/train-stat.json (new file, 6 lines)
@@ -0,0 +1,6 @@
{
    "#examples": 3098,
    "#kept_examples": 850,
    "#mappable_examples": 850,
    "#multiple_answer_examples": 1437
}

QA/webqsp/relphormer-filtered/train.json (new file, 10202 lines; diff suppressed because it is too large)

QA/webqsp/relphormer/ids.txt (new file, 1647 lines; diff suppressed because it is too large)

QA/webqsp/relphormer/test-stat.json (new file, 6 lines)
@@ -0,0 +1,6 @@
{
    "#examples": 1639,
    "#kept_examples": 1582,
    "#mappable_examples": 484,
    "#multiple_answer_examples": 800
}

QA/webqsp/relphormer/test.json (new file, 20986 lines; diff suppressed because it is too large)

QA/webqsp/relphormer/train-stat.json (new file, 6 lines)
@@ -0,0 +1,6 @@
{
    "#examples": 3098,
    "#kept_examples": 2997,
    "#mappable_examples": 850,
    "#multiple_answer_examples": 1437
}

QA/webqsp/relphormer/train.json (new file, 39858 lines; diff suppressed because it is too large)
README.md (new file, 115 lines)
@@ -0,0 +1,115 @@
# Relphormer

Code for the paper: "Relphormer: Relational Graph Transformer for Knowledge Graph Representations".

> Transformers have achieved remarkable performance in widespread fields, including natural language processing, computer vision and graph mining. However, vanilla Transformer architectures have not yielded promising improvements for Knowledge Graph (KG) representations, where the translational distance paradigm dominates. Vanilla Transformer architectures struggle to capture the intrinsically heterogeneous semantic and structural information of knowledge graphs. To this end, we propose a new variant of Transformer for knowledge graph representations dubbed Relphormer. Specifically, we introduce Triple2Seq, which dynamically samples contextualized sub-graph sequences as input to alleviate the heterogeneity issue. We propose a novel structure-enhanced self-attention mechanism to encode the relational information and keep the globally semantic information among sub-graphs. Moreover, we propose masked knowledge modeling as a new paradigm for knowledge graph representation learning. We apply Relphormer to three tasks, namely knowledge graph completion, KG-based question answering and KG-based recommendation, for evaluation. Experimental results show that Relphormer obtains better performance than baselines on benchmark datasets.

# Model Architecture

<div align=center>
<img src="./resource/model.png" width="85%" height="75%" />
</div>

The model architecture of Relphormer.
The contextualized sub-graph is sampled with Triple2Seq and converted into a sequence while maintaining its sub-graph structure.
Next, we conduct masked knowledge modeling, which randomly masks the nodes of the center triple in the contextualized sub-graph sequence.
For the Transformer architecture, we design a novel structure-enhanced mechanism to preserve the structural features.
Finally, we utilize our pre-trained KG transformer for KG-based downstream tasks.
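To make the masked knowledge modeling idea concrete, the snippet below is a minimal, self-contained sketch: a center triple plus one sampled neighbour triple are linearized into a single sequence, the tail node of the center triple is masked, and a BERT-style masked language model is asked to recover it. This is illustrative only; it does not reproduce the repository's actual Triple2Seq sampling or the structure-enhanced attention.

```python
# Illustrative sketch of masked knowledge modeling (not the repository's pipeline):
# mask one node of the center triple inside a linearized sub-graph sequence and
# let a BERT-style masked language model recover it.
import torch
from transformers import BertTokenizer, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

center = ("barack obama", "place of birth", "[MASK]")      # tail entity of the center triple is masked
neighbour = ("honolulu", "contained by", "united states")  # one sampled context triple
sequence = " ".join(center + neighbour)

inputs = tokenizer(sequence, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
mask_pos = (inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero()[0]
print(tokenizer.decode(logits[0, mask_pos].argmax(-1)))    # the model's guess for the masked node
```
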
# Environments

- python (3.8.13)
- CUDA (11.2)
- Ubuntu 18.04.6 (4.15.0-156-generic)

# Requirements

To run the code, you need to install the requirements:
```
pip install -r requirements.txt
```

The expected structure of files is:

```
── Relphormer
   ├── data
   ├── dataset
   │   ├── FB15k-237
   │   ├── WN18RR
   │   ├── umls
   │   ├── create_neighbor.py
   ├── lit_models
   │   ├── __init__.py
   │   ├── base.py
   │   ├── transformer.py
   │   └── utils.py
   ├── models
   │   ├── __init__.py
   │   ├── huggingface_relformer.py
   │   ├── model.py
   │   └── utils.py
   ├── resource
   │   └── model.png
   ├── scripts
   │   ├── fb15k-237
   │   ├── wn18rr
   │   └── umls
   ├── QA
   ├── logs
   ├── main.py
   └── requirements.txt
```
# How to run

## KGC Task

### Generate Masked Neighbors

- Use the command below to generate the masked neighbors.

```shell
>> cd dataset
>> python create_neighbor.py --dataset xxx  # e.g. python create_neighbor.py --dataset umls
```

### Entity Embedding Initialization

- Then use the command below to add the entities to BERT and to initialize the entity embedding layer used in the later training. For the other datasets, `FB15k-237` and `WN18RR`, just replace the dataset name with `fb15k-237` or `wn18rr`. A conceptual sketch of this step is given after the log command below.

```shell
>> cd pretrain
>> mkdir logs
>> bash scripts/pretrain_umls.sh
>> tail -f -n 2000 logs/pretrain_umls.log
```

The pretrained models are saved in the `Relphormer/pretrain/output` directory.
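Conceptually, this initialization step boils down to registering the KG entities as additional tokens and enlarging BERT's embedding matrix so the new rows can be trained. The sketch below only illustrates that idea (the entity token names are hypothetical placeholders); the repository's own handling lives in the pretrain scripts and the tokenizer setup in `data/data_module.py`.

```python
# Minimal sketch, not the repository's pretrain script: entities are added as
# special tokens and the embedding matrix is resized; the new rows are randomly
# initialized and learned during the later training. Token names are made up.
from transformers import BertTokenizer, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

entity_tokens = ["[ENT_0]", "[ENT_1]"]  # placeholder entity tokens
tokenizer.add_special_tokens({"additional_special_tokens": entity_tokens})
model.resize_token_embeddings(len(tokenizer))
```
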
### Entity Prediction

- Next, use the command below to train the model to predict the correct entity in the masked position (a small ranking sketch follows below). The same procedure applies to the other datasets.

```shell
>> cd Relphormer
>> mkdir logs
>> bash scripts/umls/umls.sh
>> tail -f -n 2000 logs/train_umls.log
```

The trained models are saved in the `Relphormer/output` directory.
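At evaluation time, "predicting the entity in the masked position" can be read as ranking entity tokens by their masked-LM scores. The helper below is a hedged sketch of that ranking step: `entity_id_st` and `entity_id_end` mirror the id-range bookkeeping in `data/data_module.py`, but the function itself is illustrative and not the repository's evaluation code.

```python
# Hedged sketch of entity ranking at the [MASK] position; the real logic lives
# in the training/evaluation code, this only shows the idea.
import torch

def rank_entities(logits, input_ids, mask_token_id, entity_id_st, entity_id_end):
    """Return entity-token ids sorted from most to least likely at the [MASK] slot."""
    mask_pos = (input_ids == mask_token_id).nonzero(as_tuple=True)[0][0]
    entity_scores = logits[mask_pos, entity_id_st:entity_id_end]
    order = torch.argsort(entity_scores, descending=True)
    return order + entity_id_st
```
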
## QA Task

The experimental settings for QA follow the [HittER](https://arxiv.org/pdf/2008.12813.pdf) experimental settings, and the environment can be installed by referring to the [HittER GitHub repository](https://github.com/microsoft/HittER). We only modified **hitter-bert.py** to fit our model.

- The Relphormer model used by QA can be downloaded [here](https://drive.google.com/file/d/1FK_A_kFq1ECoNm75RfkcvYv8rZiJL1Bw/view?usp=sharing).

```shell
>> cd QA
>> sh scripts/relphormer_fbqa.sh
>> sh scripts/relphormer_fbqa_filtered.sh
>> sh scripts/relphormer_webqsp.sh
>> sh scripts/relphormer_webqsp_filtered.sh
```
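For reference, `hitter-bert.py` loads this checkpoint as a plain `BertForMaskedLM` when `--relphormer` is passed; a minimal sketch of that loading step (assuming the download has been unpacked to `./local/relphormer/`, the path used in the script):

```python
# Loading the downloaded Relphormer checkpoint the same way QA/hitter-bert.py does.
from transformers import AutoConfig, BertForMaskedLM

MODEL = "./local/relphormer/"  # unpack the Google Drive download here
config = AutoConfig.from_pretrained(MODEL)
bertmodel = BertForMaskedLM.from_pretrained(MODEL, config=config)
```
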
data/__init__.py (new file, 2 lines)
@@ -0,0 +1,2 @@
from .data_module import KGC
from .processor import convert_examples_to_features, KGProcessor
data/algos.pyx (new file, 63 lines)
@@ -0,0 +1,63 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import cython
|
||||
from cython.parallel cimport prange, parallel
|
||||
cimport numpy
|
||||
import numpy
|
||||
|
||||
def floyd_warshall(adjacency_matrix):
|
||||
|
||||
(nrows, ncols) = adjacency_matrix.shape
|
||||
assert nrows == ncols
|
||||
cdef unsigned int n = nrows
|
||||
|
||||
adj_mat_copy = adjacency_matrix.astype(long, order='C', casting='safe', copy=True)
|
||||
assert adj_mat_copy.flags['C_CONTIGUOUS']
|
||||
cdef numpy.ndarray[long, ndim=2, mode='c'] M = adj_mat_copy
|
||||
cdef numpy.ndarray[long, ndim=2, mode='c'] path = numpy.zeros([n, n], dtype=numpy.int64)
|
||||
|
||||
cdef unsigned int i, j, k
|
||||
cdef long M_ij, M_ik, cost_ikkj
|
||||
cdef long* M_ptr = &M[0,0]
|
||||
cdef long* M_i_ptr
|
||||
cdef long* M_k_ptr
|
||||
|
||||
# set unreachable nodes distance to 510
|
||||
for i in range(n):
|
||||
for j in range(n):
|
||||
if i == j:
|
||||
M[i][j] = 0
|
||||
elif M[i][j] == 0:
|
||||
M[i][j] = 510
|
||||
|
||||
# Floyd-Warshall algorithm
|
||||
for k in range(n):
|
||||
M_k_ptr = M_ptr + n*k
|
||||
for i in range(n):
|
||||
M_i_ptr = M_ptr + n*i
|
||||
M_ik = M_i_ptr[k]
|
||||
for j in range(n):
|
||||
cost_ikkj = M_ik + M_k_ptr[j]
|
||||
M_ij = M_i_ptr[j]
|
||||
if M_ij > cost_ikkj:
|
||||
M_i_ptr[j] = cost_ikkj
|
||||
path[i][j] = k
|
||||
|
||||
# set unreachable path to 510
|
||||
for i in range(n):
|
||||
for j in range(n):
|
||||
if M[i][j] >= 510:
|
||||
path[i][j] = 510
|
||||
M[i][j] = 510
|
||||
|
||||
return M, path
|
||||
|
||||
|
||||
def get_all_edges(path, i, j):
|
||||
cdef unsigned int k = path[i][j]
|
||||
if k == 0:
|
||||
return []
|
||||
else:
|
||||
return get_all_edges(path, i, k) + [k] + get_all_edges(path, k, j)
|
||||
|
data/base_data_module.py (new file, 71 lines)
@@ -0,0 +1,71 @@
|
||||
"""Base DataModule class."""
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
import argparse
|
||||
import os
|
||||
|
||||
import pytorch_lightning as pl
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
|
||||
class Config(dict):
|
||||
def __getattr__(self, name):
|
||||
return self.get(name)
|
||||
|
||||
def __setattr__(self, name, val):
|
||||
self[name] = val
|
||||
|
||||
|
||||
BATCH_SIZE = 8
|
||||
NUM_WORKERS = 8
|
||||
|
||||
|
||||
class BaseDataModule(pl.LightningDataModule):
|
||||
"""
|
||||
Base DataModule.
|
||||
Learn more at https://pytorch-lightning.readthedocs.io/en/stable/datamodules.html
|
||||
"""
|
||||
|
||||
def __init__(self, args: argparse.Namespace = None) -> None:
|
||||
super().__init__()
|
||||
self.args = Config(vars(args)) if args is not None else {}
|
||||
self.batch_size = self.args.get("batch_size", BATCH_SIZE)
|
||||
self.num_workers = self.args.get("num_workers", NUM_WORKERS)
|
||||
|
||||
|
||||
@staticmethod
|
||||
def add_to_argparse(parser):
|
||||
parser.add_argument(
|
||||
"--batch_size", type=int, default=BATCH_SIZE, help="Number of examples to operate on per forward step."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num_workers", type=int, default=0, help="Number of additional processes to load data."
|
||||
)
|
||||
parser.add_argument(
    "--dataset", type=str, default="./dataset/NELL", help="Path to the dataset directory."
|
||||
)
|
||||
return parser
|
||||
|
||||
def prepare_data(self):
|
||||
"""
|
||||
Use this method to do things that might write to disk or that need to be done only from a single GPU in distributed settings (so don't set state `self.x = y`).
|
||||
"""
|
||||
pass
|
||||
|
||||
def setup(self, stage=None):
|
||||
"""
|
||||
Split into train, val, test, and set dims.
|
||||
Should assign `torch Dataset` objects to self.data_train, self.data_val, and optionally self.data_test.
|
||||
"""
|
||||
self.data_train = None
|
||||
self.data_val = None
|
||||
self.data_test = None
|
||||
|
||||
def train_dataloader(self):
|
||||
return DataLoader(self.data_train, shuffle=True, batch_size=self.batch_size, num_workers=self.num_workers, pin_memory=True)
|
||||
|
||||
def val_dataloader(self):
|
||||
return DataLoader(self.data_val, shuffle=False, batch_size=self.batch_size, num_workers=self.num_workers, pin_memory=True)
|
||||
|
||||
def test_dataloader(self):
|
||||
return DataLoader(self.data_test, shuffle=False, batch_size=self.batch_size, num_workers=self.num_workers, pin_memory=True)
|
data/data_module.py (new file, 195 lines)
@@ -0,0 +1,195 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
|
||||
from enum import Enum
|
||||
import torch
|
||||
|
||||
from torch.utils.data import DataLoader
|
||||
from transformers import AutoTokenizer, BertTokenizer
|
||||
# from transformers.configuration_bert import BertTokenizer, BertTokenizerFast
|
||||
from transformers.tokenization_utils_base import (BatchEncoding,
|
||||
PreTrainedTokenizerBase)
|
||||
|
||||
from .base_data_module import BaseDataModule
|
||||
from .processor import KGProcessor, get_dataset
|
||||
import transformers
|
||||
transformers.logging.set_verbosity_error()
|
||||
|
||||
class ExplicitEnum(Enum):
|
||||
"""
|
||||
Enum with more explicit error message for missing values.
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def _missing_(cls, value):
|
||||
raise ValueError(
|
||||
f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}"
|
||||
)
|
||||
|
||||
class PaddingStrategy(ExplicitEnum):
|
||||
"""
|
||||
Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for tab-completion
|
||||
in an IDE.
|
||||
"""
|
||||
|
||||
LONGEST = "longest"
|
||||
MAX_LENGTH = "max_length"
|
||||
DO_NOT_PAD = "do_not_pad"
|
||||
|
||||
import numpy as np
|
||||
|
||||
@dataclass
|
||||
class DataCollatorForSeq2Seq:
|
||||
"""
|
||||
Data collator that will dynamically pad the inputs received, as well as the labels.
|
||||
|
||||
Args:
|
||||
tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
|
||||
The tokenizer used for encoding the data.
|
||||
model (:class:`~transformers.PreTrainedModel`):
|
||||
The model that is being trained. If set and has the `prepare_decoder_input_ids_from_labels`, use it to
|
||||
prepare the `decoder_input_ids`
|
||||
|
||||
This is useful when using `label_smoothing` to avoid calculating loss twice.
|
||||
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
|
||||
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
|
||||
among:
|
||||
|
||||
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
|
||||
sequence is provided).
|
||||
* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
|
||||
maximum acceptable input length for the model if that argument is not provided.
|
||||
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
|
||||
different lengths).
|
||||
max_length (:obj:`int`, `optional`):
|
||||
Maximum length of the returned list and optionally padding length (see above).
|
||||
pad_to_multiple_of (:obj:`int`, `optional`):
|
||||
If set will pad the sequence to a multiple of the provided value.
|
||||
|
||||
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
|
||||
7.5 (Volta).
|
||||
label_pad_token_id (:obj:`int`, `optional`, defaults to -100):
|
||||
The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
|
||||
"""
|
||||
|
||||
tokenizer: PreTrainedTokenizerBase
|
||||
model: Optional[Any] = None
|
||||
padding: Union[bool, str, PaddingStrategy] = True
|
||||
max_length: Optional[int] = None
|
||||
pad_to_multiple_of: Optional[int] = None
|
||||
label_pad_token_id: int = -100
|
||||
return_tensors: str = "pt"
|
||||
num_labels: int = 0
|
||||
|
||||
def __call__(self, features, return_tensors=None):
|
||||
|
||||
if return_tensors is None:
|
||||
return_tensors = self.return_tensors
|
||||
labels = [feature.pop("labels") for feature in features] if "labels" in features[0].keys() else None
|
||||
label = [feature.pop("label") for feature in features]
|
||||
features_keys = {}
|
||||
name_keys = list(features[0].keys())
|
||||
for k in name_keys:
|
||||
# ignore the padding arguments
|
||||
if k in ["input_ids", "attention_mask", "token_type_ids"]: continue
|
||||
try:
|
||||
features_keys[k] = [feature.pop(k) for feature in features]
|
||||
except KeyError:
|
||||
continue
|
||||
|
||||
# We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the
|
||||
# same length to return tensors.
|
||||
bsz = len(labels)
|
||||
with torch.no_grad():
|
||||
new_labels = torch.zeros(bsz, self.num_labels)
|
||||
for i,l in enumerate(labels):
|
||||
if isinstance(l, int):
|
||||
new_labels[i][l] = 1
|
||||
else:
|
||||
for j in l:
|
||||
new_labels[i][j] = 1
|
||||
labels = new_labels
|
||||
|
||||
features = self.tokenizer.pad(
|
||||
features,
|
||||
padding=self.padding,
|
||||
max_length=self.max_length,
|
||||
pad_to_multiple_of=self.pad_to_multiple_of,
|
||||
return_tensors=return_tensors,
|
||||
)
|
||||
features['labels'] = labels
|
||||
features['label'] = torch.tensor(label)
|
||||
features.update(features_keys)
|
||||
|
||||
return features
|
||||
|
||||
|
||||
|
||||
class KGC(BaseDataModule):
|
||||
def __init__(self, args, model) -> None:
|
||||
super().__init__(args)
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(self.args.model_name_or_path, use_fast=False)
|
||||
self.processor = KGProcessor(self.tokenizer, args)
|
||||
self.label_list = self.processor.get_labels(args.data_dir)
|
||||
|
||||
entity_list = self.processor.get_entities(args.data_dir)
|
||||
|
||||
num_added_tokens = self.tokenizer.add_special_tokens({'additional_special_tokens': entity_list})
|
||||
self.sampler = DataCollatorForSeq2Seq(self.tokenizer,
|
||||
model=model,
|
||||
label_pad_token_id=self.tokenizer.pad_token_id,
|
||||
pad_to_multiple_of=8 if self.args.precision == 16 else None,
|
||||
padding="longest",
|
||||
max_length=self.args.max_seq_length,
|
||||
num_labels = len(entity_list),
|
||||
)
|
||||
relations_tokens = self.processor.get_relations(args.data_dir)
|
||||
self.num_relations = len(relations_tokens)
|
||||
num_added_tokens = self.tokenizer.add_special_tokens({'additional_special_tokens': relations_tokens})
|
||||
|
||||
vocab = self.tokenizer.get_added_vocab()
|
||||
self.relation_id_st = vocab[relations_tokens[0]]
|
||||
self.relation_id_ed = vocab[relations_tokens[-1]] + 1
|
||||
self.entity_id_st = vocab[entity_list[0]]
|
||||
self.entity_id_ed = vocab[entity_list[-1]] + 1
|
||||
|
||||
|
||||
def setup(self, stage=None):
|
||||
self.data_train = get_dataset(self.args, self.processor, self.label_list, self.tokenizer, "train")
|
||||
self.data_val = get_dataset(self.args, self.processor, self.label_list, self.tokenizer, "dev")
|
||||
self.data_test = get_dataset(self.args, self.processor, self.label_list, self.tokenizer, "test")
|
||||
|
||||
def prepare_data(self):
|
||||
pass
|
||||
|
||||
def get_config(self):
|
||||
d = {}
|
||||
for k, v in self.__dict__.items():
|
||||
if "st" in k or "ed" in k:
|
||||
d.update({k:v})
|
||||
|
||||
return d
|
||||
|
||||
|
||||
@staticmethod
|
||||
def add_to_argparse(parser):
|
||||
BaseDataModule.add_to_argparse(parser)
|
||||
parser.add_argument("--model_name_or_path", type=str, default="roberta-base", help="the name or the path to the pretrained model")
parser.add_argument("--data_dir", type=str, default="roberta-base", help="the path to the dataset directory")
parser.add_argument("--max_seq_length", type=int, default=256, help="maximum input sequence length")
parser.add_argument("--warm_up_radio", type=float, default=0.1, help="fraction of total training steps used for learning-rate warm-up")
|
||||
parser.add_argument("--eval_batch_size", type=int, default=8)
|
||||
parser.add_argument("--overwrite_cache", action="store_true", default=False)
|
||||
return parser
|
||||
|
||||
def get_tokenizer(self):
|
||||
return self.tokenizer
|
||||
|
||||
def train_dataloader(self):
|
||||
return DataLoader(self.data_train, num_workers=self.num_workers, pin_memory=True, collate_fn=self.sampler, batch_size=self.args.batch_size, shuffle=not self.args.faiss_init)
|
||||
|
||||
def val_dataloader(self):
|
||||
return DataLoader(self.data_val, num_workers=self.num_workers, pin_memory=True, collate_fn=self.sampler, batch_size=self.args.eval_batch_size)
|
||||
|
||||
def test_dataloader(self):
|
||||
return DataLoader(self.data_test, num_workers=self.num_workers, pin_memory=True, collate_fn=self.sampler, batch_size=self.args.eval_batch_size)
|
||||
|
data/processor.py (new file, 954 lines)
@@ -0,0 +1,954 @@
|
||||
from hashlib import new
|
||||
from re import DEBUG
|
||||
|
||||
import contextlib
|
||||
import sys
|
||||
|
||||
from collections import Counter
|
||||
from multiprocessing import Pool
|
||||
from torch._C import HOIST_CONV_PACKED_PARAMS
|
||||
from torch.utils.data import Dataset, Sampler, IterableDataset
|
||||
from collections import defaultdict
|
||||
from functools import partial
|
||||
from multiprocessing import Pool
|
||||
import os
|
||||
import random
|
||||
import json
|
||||
import torch
|
||||
import copy
|
||||
import numpy as np
|
||||
import pickle
|
||||
from tqdm import tqdm
|
||||
from dataclasses import dataclass, asdict, replace
|
||||
import inspect
|
||||
|
||||
from transformers.models.auto.tokenization_auto import AutoTokenizer
|
||||
|
||||
from models.utils import get_entity_spans_pre_processing
|
||||
import pyximport
|
||||
|
||||
pyximport.install(setup_args={'include_dirs': np.get_include()})
|
||||
import data.algos as algos
|
||||
|
||||
def lmap(a, b):
    return list(map(a, b))  # a is a function, b is a list of values; returns the list of mapped values
|
||||
|
||||
def cache_results(_cache_fp, _refresh=False, _verbose=1):
    r"""
    cache_results is the decorator used in fastNLP to cache data. The example below shows how to use it::

        import time
        import numpy as np
        from fastNLP import cache_results

        @cache_results('cache.pkl')
        def process_data():
            # some time-consuming work such as reading and preprocessing data; time.sleep() stands in for it here
            time.sleep(1)
            return np.random.randint(10, size=(5,))

        start_time = time.time()
        print("res =", process_data())
        print(time.time() - start_time)

        start_time = time.time()
        print("res =", process_data())
        print(time.time() - start_time)

        # The output looks like the following: the two results are identical and the second call takes almost no time.
        # Save cache to cache.pkl.
        # res = [5 4 9 1 8]
        # 1.0042750835418701
        # Read cache from cache.pkl.
        # res = [5 4 9 1 8]
        # 0.0040721893310546875

    The second run is fast because the result is read directly from cache.pkl instead of being preprocessed again::

        # Continuing the example above: to generate a separate cache, e.g. for another dataset, call it like this
        process_data(_cache_fp='cache2.pkl')  # does not affect the previous 'cache.pkl'

    `_cache_fp` above is a parameter recognised by cache_results; the result is cached to / read from 'cache2.pkl',
    overriding the default 'cache.pkl'. Decorating a function with @cache_results() adds three parameters
    [_cache_fp, _refresh, _verbose] to it. They are not passed on to your function, so your function's own
    parameters must not use these three names::

        process_data(_cache_fp='cache2.pkl', _refresh=True)  # force the preprocessing cache to be regenerated
        # _verbose controls logging: 0 prints nothing; 1 reports whether the step read an existing cache or generated a new one

    :param str _cache_fp: where to cache the returned result, or where to read the cache from. If None, cache_results
        has no effect, unless _cache_fp is passed when the function is called.
    :param bool _refresh: whether to regenerate the cache.
    :param int _verbose: whether to print cache information.
    :return:
    """
|
||||
|
||||
def wrapper_(func):
|
||||
signature = inspect.signature(func)
|
||||
for key, _ in signature.parameters.items():
|
||||
if key in ('_cache_fp', '_refresh', '_verbose'):
|
||||
raise RuntimeError("The function decorated by cache_results cannot have keyword `{}`.".format(key))
|
||||
|
||||
def wrapper(*args, **kwargs):
|
||||
my_args = args[0]
|
||||
mode = args[-1]
|
||||
if '_cache_fp' in kwargs:
|
||||
cache_filepath = kwargs.pop('_cache_fp')
|
||||
assert isinstance(cache_filepath, str), "_cache_fp can only be str."
|
||||
else:
|
||||
cache_filepath = _cache_fp
|
||||
if '_refresh' in kwargs:
|
||||
refresh = kwargs.pop('_refresh')
|
||||
assert isinstance(refresh, bool), "_refresh can only be bool."
|
||||
else:
|
||||
refresh = _refresh
|
||||
if '_verbose' in kwargs:
|
||||
verbose = kwargs.pop('_verbose')
|
||||
assert isinstance(verbose, int), "_verbose can only be integer."
|
||||
else:
|
||||
verbose = _verbose
|
||||
refresh_flag = True
|
||||
|
||||
model_name = my_args.model_name_or_path.split("/")[-1]
|
||||
is_pretrain = my_args.pretrain
|
||||
cache_filepath = os.path.join(my_args.data_dir, f"cached_{mode}_features{model_name}_pretrain{is_pretrain}_faiss{my_args.faiss_init}_seqlength{my_args.max_seq_length}_{my_args.litmodel_class}.pkl")
|
||||
refresh = my_args.overwrite_cache
|
||||
|
||||
if cache_filepath is not None and refresh is False:
|
||||
# load data
|
||||
if os.path.exists(cache_filepath):
|
||||
with open(cache_filepath, 'rb') as f:
|
||||
results = pickle.load(f)
|
||||
if verbose == 1:
|
||||
logger.info("Read cache from {}.".format(cache_filepath))
|
||||
refresh_flag = False
|
||||
|
||||
if refresh_flag:
|
||||
results = func(*args, **kwargs)
|
||||
if cache_filepath is not None:
|
||||
if results is None:
|
||||
raise RuntimeError("The return value is None. Delete the decorator.")
|
||||
with open(cache_filepath, 'wb') as f:
|
||||
pickle.dump(results, f)
|
||||
logger.info("Save cache to {}.".format(cache_filepath))
|
||||
|
||||
return results
|
||||
|
||||
return wrapper
|
||||
|
||||
return wrapper_
|
||||
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
||||
TensorDataset)
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
# from torch.nn import CrossEntropyLoss, MSELoss
|
||||
# from scipy.stats import pearsonr, spearmanr
|
||||
# from sklearn.metrics import matthews_corrcoef, f1_scoreclass
|
||||
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class InputExample(object):
|
||||
"""A single training/test example for simple sequence classification."""
|
||||
|
||||
def __init__(self, guid, text_a, text_b=None, text_c=None, text_d=None, label=None, real_label=None, en=None, en_id=None, rel=None, text_d_id=None, graph_inf=None):
|
||||
"""Constructs a InputExample.
|
||||
|
||||
Args:
|
||||
guid: Unique id for the example.
|
||||
text_a: string. The untokenized text of the first sequence. For single
|
||||
sequence tasks, only this sequence must be specified.
|
||||
text_b: (Optional) string. The untokenized text of the second sequence.
|
||||
Only must be specified for sequence pair tasks.
|
||||
text_c: (Optional) string. The untokenized text of the third sequence.
|
||||
Only must be specified for sequence triple tasks.
|
||||
label: (Optional) string. list of entities
|
||||
"""
|
||||
self.guid = guid
|
||||
self.text_a = text_a
|
||||
self.text_b = text_b
|
||||
self.text_c = text_c
|
||||
self.text_d = text_d
|
||||
self.label = label
|
||||
self.real_label = real_label
|
||||
self.en = en
|
||||
self.rel = rel # rel id
|
||||
self.text_d_id = text_d_id
|
||||
self.graph_inf = graph_inf
|
||||
self.en_id = en_id
|
||||
|
||||
|
||||
@dataclass
|
||||
class InputFeatures:
|
||||
"""A single set of features of data."""
|
||||
|
||||
input_ids: torch.Tensor
|
||||
attention_mask: torch.Tensor
|
||||
labels: torch.Tensor = None
|
||||
label: torch.Tensor = None
|
||||
en: torch.Tensor = 0
|
||||
rel: torch.Tensor = 0
|
||||
pos: torch.Tensor = 0
|
||||
graph: torch.Tensor = 0
|
||||
distance_attention: torch.Tensor = 0
|
||||
# attention_bias: torch.Tensor = 0
|
||||
|
||||
|
||||
class DataProcessor(object):
|
||||
"""Base class for data converters for sequence classification data sets."""
|
||||
|
||||
def get_train_examples(self, data_dir):
|
||||
"""Gets a collection of `InputExample`s for the train set."""
|
||||
raise NotImplementedError()
|
||||
|
||||
def get_dev_examples(self, data_dir):
|
||||
"""Gets a collection of `InputExample`s for the dev set."""
|
||||
raise NotImplementedError()
|
||||
|
||||
def get_labels(self, data_dir):
|
||||
"""Gets the list of labels for this data set."""
|
||||
raise NotImplementedError()
|
||||
|
||||
@classmethod
|
||||
def _read_tsv(cls, input_file, quotechar=None):
|
||||
"""Reads a tab separated value file."""
|
||||
with open(input_file, "r", encoding="utf-8") as f:
|
||||
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
|
||||
lines = []
|
||||
for line in reader:
|
||||
if sys.version_info[0] == 2:
|
||||
line = list(unicode(cell, 'utf-8') for cell in line)
|
||||
lines.append(line)
|
||||
return lines
|
||||
|
||||
import copy
|
||||
|
||||
|
||||
def solve_get_knowledge_store(line, set_type="train", pretrain=1):
|
||||
"""
|
||||
use the LM to get the entity embedding.
|
||||
Transductive: triples + text description
|
||||
Inductive: text description
|
||||
|
||||
"""
|
||||
examples = []
|
||||
|
||||
head_ent_text = ent2text[line[0]]
|
||||
tail_ent_text = ent2text[line[2]]
|
||||
relation_text = rel2text[line[1]]
|
||||
|
||||
i=0
|
||||
|
||||
a = tail_filter_entities["\t".join([line[0],line[1]])]
|
||||
b = head_filter_entities["\t".join([line[2],line[1]])]
|
||||
|
||||
guid = "%s-%s" % (set_type, i)
|
||||
text_a = head_ent_text
|
||||
text_b = relation_text
|
||||
text_c = tail_ent_text
|
||||
|
||||
# use the description of c to predict A
|
||||
examples.append(
|
||||
InputExample(guid=guid, text_a="[PAD]", text_b=text_b + "[PAD]", text_c = "[PAD]" + " " + text_c, label=lmap(lambda x: ent2id[x], b), real_label=ent2id[line[0]], en=[ent2id[line[0]], rel2id[line[1]], ent2id[line[2]]], rel=0)
|
||||
)
|
||||
examples.append(
|
||||
InputExample(guid=guid, text_a="[PAD]", text_b=text_b + "[PAD]", text_c = "[PAD]" + " " + text_a, label=lmap(lambda x: ent2id[x], b), real_label=ent2id[line[2]], en=[ent2id[line[0]], rel2id[line[1]], ent2id[line[2]]], rel=0)
|
||||
)
|
||||
return examples
|
||||
|
||||
|
||||
def solve(line, set_type="train", pretrain=1, max_triplet=32):
|
||||
examples = []
|
||||
|
||||
head_ent_text = ent2text[line[0]]
|
||||
tail_ent_text = ent2text[line[2]]
|
||||
relation_text = rel2text[line[1]]
|
||||
|
||||
i=0
|
||||
|
||||
a = tail_filter_entities["\t".join([line[0],line[1]])]
|
||||
b = head_filter_entities["\t".join([line[2],line[1]])]
|
||||
|
||||
guid = "%s-%s" % (set_type, i)
|
||||
text_a = head_ent_text
|
||||
text_b = relation_text
|
||||
text_c = tail_ent_text
|
||||
|
||||
|
||||
if pretrain:
|
||||
text_a_tokens = text_a.split()
|
||||
for i in range(10):
|
||||
st = random.randint(0, len(text_a_tokens))
|
||||
examples.append(
|
||||
InputExample(guid=guid, text_a="[MASK]", text_b=" ".join(text_a_tokens[st:min(st+64, len(text_a_tokens))]), text_c = "", label=ent2id[line[0]], real_label=ent2id[line[0]], en=0, rel=0)
|
||||
)
|
||||
examples.append(
|
||||
InputExample(guid=guid, text_a="[MASK]", text_b=text_a, text_c = "", label=ent2id[line[0]], real_label=ent2id[line[0]], en=0, rel=0)
|
||||
)
|
||||
# examples.append(
|
||||
# InputExample(guid=guid, text_a="[MASK]", text_b=text_c, text_c = "", label=ent2id[line[2]], real_label=ent2id[line[2]], en=0, rel=0)
|
||||
# )
|
||||
else:
|
||||
# Mainly wraps text_c: instead of the original text it becomes the corresponding sub-graph sequence (variable graph_seq). If the tail entity is masked, graph_seq is appended after text_c.
|
||||
# masked_head_seq = []
|
||||
# masked_tail_seq = []
|
||||
# masked_tail_graph_list = masked_tail_neighbor["\t".join([line[0],line[1]])]
|
||||
# masked_head_graph_list = masked_head_neighbor["\t".join([line[2],line[1]])]
|
||||
# for item in masked_head_graph_list:
|
||||
# masked_head_seq.append(ent2id[item[0]])
|
||||
# masked_head_seq.append(rel2id[item[1]])
|
||||
# masked_head_seq.append(ent2id[item[2]])
|
||||
|
||||
# for item in masked_tail_graph_list:
|
||||
# masked_tail_seq.append(ent2id[item[0]])
|
||||
# masked_tail_seq.append(rel2id[item[1]])
|
||||
# masked_tail_seq.append(ent2id[item[2]])
|
||||
|
||||
masked_head_seq = set()
|
||||
masked_head_seq_id = set()
|
||||
masked_tail_seq = set()
|
||||
masked_tail_seq_id = set()
|
||||
|
||||
masked_tail_graph_list = masked_tail_neighbor["\t".join([line[0],line[1]])] if len(masked_tail_neighbor["\t".join([line[0],line[1]])]) < max_triplet else \
|
||||
random.sample(masked_tail_neighbor["\t".join([line[0],line[1]])], max_triplet)
|
||||
masked_head_graph_list = masked_head_neighbor["\t".join([line[2],line[1]])] if len(masked_head_neighbor["\t".join([line[2],line[1]])]) < max_triplet else \
|
||||
random.sample(masked_head_neighbor["\t".join([line[2],line[1]])], max_triplet)
|
||||
# masked_tail_graph_list = masked_tail_neighbor["\t".join([line[0],line[1]])][:16]
|
||||
# masked_head_graph_list = masked_head_neighbor["\t".join([line[2],line[1]])][:16]
|
||||
for item in masked_head_graph_list:
|
||||
masked_head_seq.add(item[0])
|
||||
masked_head_seq.add(item[1])
|
||||
masked_head_seq.add(item[2])
|
||||
masked_head_seq_id.add(ent2id[item[0]])
|
||||
masked_head_seq_id.add(rel2id[item[1]])
|
||||
masked_head_seq_id.add(ent2id[item[2]])
|
||||
|
||||
for item in masked_tail_graph_list:
|
||||
masked_tail_seq.add(item[0])
|
||||
masked_tail_seq.add(item[1])
|
||||
masked_tail_seq.add(item[2])
|
||||
masked_tail_seq_id.add(ent2id[item[0]])
|
||||
masked_tail_seq_id.add(rel2id[item[1]])
|
||||
masked_tail_seq_id.add(ent2id[item[2]])
|
||||
# print(masked_tail_seq)
|
||||
masked_head_seq = masked_head_seq.difference({line[0]})
|
||||
masked_head_seq = masked_head_seq.difference({line[2]})
|
||||
masked_head_seq = masked_head_seq.difference({line[1]})
|
||||
masked_head_seq_id = masked_head_seq_id.difference({ent2id[line[0]]})
|
||||
masked_head_seq_id = masked_head_seq_id.difference({rel2id[line[1]]})
|
||||
masked_head_seq_id = masked_head_seq_id.difference({ent2id[line[2]]})
|
||||
|
||||
masked_tail_seq = masked_tail_seq.difference({line[0]})
|
||||
masked_tail_seq = masked_tail_seq.difference({line[2]})
|
||||
masked_tail_seq = masked_tail_seq.difference({line[1]})
|
||||
masked_tail_seq_id = masked_tail_seq_id.difference({ent2id[line[0]]})
|
||||
masked_tail_seq_id = masked_tail_seq_id.difference({rel2id[line[1]]})
|
||||
masked_tail_seq_id = masked_tail_seq_id.difference({ent2id[line[2]]})
|
||||
# examples.append(
|
||||
# InputExample(guid=guid, text_a="[MASK]", text_b=' '.join(text_b.split(' ')[:16]) + " [PAD]", text_c = "[PAD]" + " " + ' '.join(text_c.split(' ')[:16]), text_d = masked_head_seq, label=lmap(lambda x: ent2id[x], b), real_label=ent2id[line[0]], en=[rel2id[line[1]], ent2id[line[2]]], rel=rel2id[line[1]]))
|
||||
# examples.append(
|
||||
# InputExample(guid=guid, text_a="[PAD] ", text_b=' '.join(text_b.split(' ')[:16]) + " [PAD]", text_c = "[MASK]" +" " + ' '.join(text_a.split(' ')[:16]), text_d = masked_tail_seq, label=lmap(lambda x: ent2id[x], a), real_label=ent2id[line[2]], en=[ent2id[line[0]], rel2id[line[1]]], rel=rel2id[line[1]]))
|
||||
examples.append(
|
||||
InputExample(guid=guid, text_a="[MASK]", text_b="[PAD]", text_c = "[PAD]", text_d = list(masked_head_seq), label=lmap(lambda x: ent2id[x], b), real_label=ent2id[line[0]], en=[line[1], line[2]], en_id = [rel2id[line[1]], ent2id[line[2]]], rel=rel2id[line[1]], text_d_id = list(masked_head_seq_id), graph_inf = masked_head_graph_list))
|
||||
examples.append(
|
||||
InputExample(guid=guid, text_a="[PAD]", text_b="[PAD]", text_c = "[MASK]", text_d = list(masked_tail_seq), label=lmap(lambda x: ent2id[x], a), real_label=ent2id[line[2]], en=[line[0], line[1]], en_id = [ent2id[line[0]], rel2id[line[1]]], rel=rel2id[line[1]], text_d_id = list(masked_tail_seq_id), graph_inf = masked_tail_graph_list))
|
||||
return examples
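# solve() therefore yields two examples per (h, r, t) triple: one with the head masked
# (predict h from r, t and the head-side neighbor subgraph) and one with the tail masked
# (predict t from h, r and the tail-side neighbor subgraph); the sampled subgraph is carried
# in text_d / text_d_id / graph_inf.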
|
||||
|
||||
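# filter_init()/delete_init() stash the shared lookup tables (entity/relation text, id maps,
# masked-neighbor subgraphs, placeholder tokens) in module-level globals so that
# solve()/solve_get_knowledge_store() can read them when mapped over the triple lines
# without the tables being passed through every call.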
def filter_init(head, tail, t1,t2, ent2id_, ent2token_, rel2id_, masked_head_neighbor_, masked_tail_neighbor_, rel2token_):
|
||||
global head_filter_entities
|
||||
global tail_filter_entities
|
||||
global ent2text
|
||||
global rel2text
|
||||
global ent2id
|
||||
global ent2token
|
||||
global rel2id
|
||||
global masked_head_neighbor
|
||||
global masked_tail_neighbor
|
||||
global rel2token
|
||||
|
||||
head_filter_entities = head
|
||||
tail_filter_entities = tail
|
||||
ent2text =t1
|
||||
rel2text =t2
|
||||
ent2id = ent2id_
|
||||
ent2token = ent2token_
|
||||
rel2id = rel2id_
|
||||
masked_head_neighbor = masked_head_neighbor_
|
||||
masked_tail_neighbor = masked_tail_neighbor_
|
||||
rel2token = rel2token_
|
||||
|
||||
def delete_init(ent2text_):
|
||||
global ent2text
|
||||
ent2text = ent2text_
|
||||
|
||||
|
||||
class KGProcessor(DataProcessor):
|
||||
"""Processor for knowledge graph data set."""
|
||||
def __init__(self, tokenizer, args):
|
||||
self.labels = set()
|
||||
self.tokenizer = tokenizer
|
||||
self.args = args
|
||||
self.entity_path = os.path.join(args.data_dir, "entity2textlong.txt") if os.path.exists(os.path.join(args.data_dir, 'entity2textlong.txt')) \
|
||||
else os.path.join(args.data_dir, "entity2text.txt")
|
||||
|
||||
def get_train_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
return self._create_examples(
|
||||
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train", data_dir, self.args)
|
||||
|
||||
def get_dev_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
return self._create_examples(
|
||||
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev", data_dir, self.args)
|
||||
|
||||
def get_test_examples(self, data_dir, chunk=""):
|
||||
"""See base class."""
|
||||
return self._create_examples(
|
||||
self._read_tsv(os.path.join(data_dir, f"test{chunk}.tsv")), "test", data_dir, self.args)
|
||||
|
||||
def get_relations(self, data_dir):
|
||||
"""Gets all labels (relations) in the knowledge graph."""
|
||||
# return list(self.labels)
|
||||
with open(os.path.join(data_dir, "relations.txt"), 'r') as f:
|
||||
lines = f.readlines()
|
||||
relations = []
|
||||
for line in lines:
|
||||
relations.append(line.strip().split('\t')[0])
|
||||
rel2token = {ent : f"[RELATION_{i}]" for i, ent in enumerate(relations)}
|
||||
return list(rel2token.values())
|
||||
|
||||
def get_labels(self, data_dir):
"""Gets all relation texts, used as labels, in the knowledge graph."""
relation = []
with open(os.path.join(data_dir, "relation2text.txt"), 'r') as f:
lines = f.readlines()
for line in lines:
relation.append(line.strip().split("\t")[-1])
return relation
|
||||
|
||||
def get_entities(self, data_dir):
|
||||
"""Gets all entities in the knowledge graph."""
|
||||
with open(self.entity_path, 'r') as f:
|
||||
lines = f.readlines()
|
||||
entities = []
|
||||
for line in lines:
|
||||
entities.append(line.strip().split("\t")[0])
|
||||
|
||||
ent2token = {ent : f"[ENTITY_{i}]" for i, ent in enumerate(entities)}
|
||||
return list(ent2token.values())
|
||||
|
||||
def get_train_triples(self, data_dir):
|
||||
"""Gets training triples."""
|
||||
return self._read_tsv(os.path.join(data_dir, "train.tsv"))
|
||||
|
||||
def get_dev_triples(self, data_dir):
|
||||
"""Gets validation triples."""
|
||||
return self._read_tsv(os.path.join(data_dir, "dev.tsv"))
|
||||
|
||||
def get_test_triples(self, data_dir, chunk=""):
|
||||
"""Gets test triples."""
|
||||
return self._read_tsv(os.path.join(data_dir, f"test{chunk}.tsv"))
|
||||
|
||||
def _create_examples(self, lines, set_type, data_dir, args):
|
||||
"""Creates examples for the training and dev sets."""
|
||||
# entity to text
|
||||
ent2text = {}
|
||||
ent2text_with_type = {}
|
||||
with open(self.entity_path, 'r') as f:
|
||||
ent_lines = f.readlines()
|
||||
for line in ent_lines:
|
||||
temp = line.strip().split('\t')
|
||||
try:
|
||||
end = temp[1]#.find(',')
|
||||
if "wiki" in data_dir:
|
||||
assert "Q" in temp[0]
|
||||
ent2text[temp[0]] = temp[1].replace("\\n", " ").replace("\\", "") #[:end]
|
||||
except IndexError:
|
||||
# continue
|
||||
end = " "#.find(',')
|
||||
if "wiki" in data_dir:
|
||||
assert "Q" in temp[0]
|
||||
ent2text[temp[0]] = end #[:end]
|
||||
|
||||
entities = list(ent2text.keys())
|
||||
ent2token = {ent : f"[ENTITY_{i}]" for i, ent in enumerate(entities)}
|
||||
ent2id = {ent : i for i, ent in enumerate(entities)}
|
||||
|
||||
rel2text = {}
|
||||
with open(os.path.join(data_dir, "relation2text.txt"), 'r') as f:
|
||||
rel_lines = f.readlines()
|
||||
for line in rel_lines:
|
||||
temp = line.strip().split('\t')
|
||||
rel2text[temp[0]] = temp[1]
|
||||
|
||||
relation_names = {}
|
||||
with open(os.path.join(data_dir, "relations.txt"), "r") as file:
|
||||
for line in file.readlines():
|
||||
t = line.strip()
|
||||
relation_names[t] = rel2text[t]
|
||||
|
||||
tmp_lines = []
|
||||
not_in_text = 0
|
||||
for line in tqdm(lines, desc="drop triples whose entities or relation lack text"):
|
||||
if (line[0] not in ent2text) or (line[2] not in ent2text) or (line[1] not in rel2text):
|
||||
not_in_text += 1
|
||||
continue
|
||||
tmp_lines.append(line)
|
||||
lines = tmp_lines
|
||||
print(f"total entity not in text : {not_in_text} ")
|
||||
|
||||
relations = list(rel2text.keys())
|
||||
rel2token = {rel : f"[RELATION_{i}]" for i, rel in enumerate(relations)}
|
||||
# rel id -> relation token id
|
||||
num_entities = len(self.get_entities(args.data_dir))
|
||||
rel2id = {w:i+num_entities for i,w in enumerate(relation_names.keys())}
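# Relation ids are offset by the number of entities, so entity ids and relation ids share one
# contiguous id space; get_dataset() later shifts these ids past len(tokenizer) when writing
# them into input_ids.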
|
||||
|
||||
|
||||
with open(os.path.join(data_dir, "masked_head_neighbor.txt"), 'r') as file:
|
||||
masked_head_neighbor = json.load(file)
|
||||
|
||||
with open(os.path.join(data_dir, "masked_tail_neighbor.txt"), 'r') as file:
|
||||
masked_tail_neighbor = json.load(file)
|
||||
|
||||
examples = []
|
||||
# head filter head entity
|
||||
head_filter_entities = defaultdict(list)
|
||||
tail_filter_entities = defaultdict(list)
|
||||
|
||||
dataset_list = ["train.tsv", "dev.tsv", "test.tsv"]
|
||||
# in training, only use the train triples
|
||||
if set_type == "train" and not args.pretrain: dataset_list = dataset_list[0:1]
|
||||
for m in dataset_list:
|
||||
with open(os.path.join(data_dir, m), 'r') as file:
|
||||
train_lines = file.readlines()
|
||||
for idx in range(len(train_lines)):
|
||||
train_lines[idx] = train_lines[idx].strip().split("\t")
|
||||
|
||||
for line in train_lines:
|
||||
tail_filter_entities["\t".join([line[0], line[1]])].append(line[2])
|
||||
head_filter_entities["\t".join([line[2], line[1]])].append(line[0])
|
||||
|
||||
max_head_entities = max(len(_) for _ in head_filter_entities.values())
|
||||
max_tail_entities = max(len(_) for _ in tail_filter_entities.values())
|
||||
|
||||
# use bce loss, ignore the mlm
|
||||
if set_type == "train" and args.bce:
|
||||
lines = []
|
||||
for k, v in tail_filter_entities.items():
|
||||
h, r = k.split('\t')
|
||||
t = v[0]
|
||||
lines.append([h, r, t])
|
||||
for k, v in head_filter_entities.items():
|
||||
t, r = k.split('\t')
|
||||
h = v[0]
|
||||
lines.append([h, r, t])
|
||||
|
||||
|
||||
# for pretraining, emit one (entity, rel, entity) line per entity so that each entity gets a mask embedding.
|
||||
if args.pretrain:
|
||||
rel = list(rel2text.keys())[0]
|
||||
lines = []
|
||||
for k in ent2text.keys():
|
||||
lines.append([k, rel, k])
|
||||
|
||||
print(f"max number of filter entities : {max_head_entities} {max_tail_entities}")
|
||||
# Pass the subgraph information into filter_init (loaded from files, i.e. fixed subgraphs) and store it in globals that solve() reads.
|
||||
from os import cpu_count
|
||||
threads = min(1, cpu_count())
|
||||
filter_init(head_filter_entities, tail_filter_entities,ent2text, rel2text, ent2id, ent2token, rel2id, masked_head_neighbor, masked_tail_neighbor, rel2token
|
||||
)
|
||||
|
||||
if hasattr(args, "faiss_init") and args.faiss_init:
|
||||
annotate_ = partial(
|
||||
solve_get_knowledge_store,
|
||||
pretrain=self.args.pretrain
|
||||
)
|
||||
else:
|
||||
annotate_ = partial(
|
||||
solve,
|
||||
pretrain=self.args.pretrain,
|
||||
max_triplet=self.args.max_triplet
|
||||
)
|
||||
examples = list(
|
||||
tqdm(
|
||||
map(annotate_, lines),
|
||||
total=len(lines),
|
||||
desc="convert text to examples"
|
||||
)
|
||||
)
|
||||
|
||||
tmp_examples = []
|
||||
for e in examples:
|
||||
for ee in e:
|
||||
tmp_examples.append(ee)
|
||||
examples = tmp_examples
|
||||
# delete vars
|
||||
del head_filter_entities, tail_filter_entities, ent2text, rel2text, ent2id, ent2token, rel2id
|
||||
return examples
|
||||
|
||||
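# Rough usage sketch (argument names follow the args used above; the rest is illustrative):
#   processor = KGProcessor(tokenizer, args)
#   train_examples = processor.get_train_examples(args.data_dir)
# get_dataset() below performs these calls and converts the examples into model features.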
class Verbalizer(object):
|
||||
def __init__(self, args):
|
||||
if "WN18RR" in args.data_dir:
|
||||
self.mode = "WN18RR"
|
||||
elif "FB15k" in args.data_dir:
|
||||
self.mode = "FB15k"
|
||||
elif "umls" in args.data_dir:
|
||||
self.mode = "umls"
|
||||
elif "codexs" in args.data_dir:
|
||||
self.mode = "codexs"
|
||||
elif "FB13" in args.data_dir:
|
||||
self.mode = "FB13"
|
||||
elif "WN11" in args.data_dir:
|
||||
self.mode = "WN11"
|
||||
|
||||
|
||||
def _convert(self, head, relation, tail):
|
||||
if self.mode == "umls":
|
||||
return f"The {relation} {head} is "
|
||||
|
||||
return f"{head} {relation}"
|
||||
|
||||
|
||||
class KGCDataset(Dataset):
|
||||
def __init__(self, features):
|
||||
self.features = features
|
||||
|
||||
def __getitem__(self, index):
|
||||
return self.features[index]
|
||||
|
||||
def __len__(self):
|
||||
return len(self.features)
|
||||
|
||||
def convert_examples_to_features_init(tokenizer_for_convert):
|
||||
global tokenizer
|
||||
tokenizer = tokenizer_for_convert
|
||||
|
||||
def convert_examples_to_features(example, max_seq_length, mode, pretrain=1):
|
||||
"""Loads a data file into a list of `InputBatch`s."""
|
||||
text_a = " ".join(example.text_a.split()[:128])
|
||||
text_b = " ".join(example.text_b.split()[:128])
|
||||
text_c = " ".join(example.text_c.split()[:128])
|
||||
|
||||
if pretrain:
|
||||
input_text_a = text_a
|
||||
input_text_b = text_b
|
||||
else:
|
||||
input_text_a = " ".join([text_a, text_b])
|
||||
input_text_b = text_c
|
||||
|
||||
|
||||
inputs = tokenizer(
|
||||
input_text_a,
|
||||
input_text_b,
|
||||
truncation="longest_first",
|
||||
max_length=max_seq_length,
|
||||
padding="longest",
|
||||
add_special_tokens=True,
|
||||
)
|
||||
# assert tokenizer.mask_token_id in inputs.input_ids, "mask token must in input"
|
||||
|
||||
features = asdict(InputFeatures(input_ids=inputs["input_ids"],
|
||||
attention_mask=inputs['attention_mask'],
|
||||
labels=torch.tensor(example.label),
|
||||
label=torch.tensor(example.real_label)
|
||||
)
|
||||
)
|
||||
return features
|
||||
|
||||
|
||||
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
|
||||
"""Truncates a sequence pair in place to the maximum length."""
|
||||
|
||||
# This is a simple heuristic which will always truncate the longer sequence
|
||||
# one token at a time. This makes more sense than truncating an equal percent
|
||||
# of tokens from each, since if one sequence is very short then each token
|
||||
# that's truncated likely contains more information than a longer sequence.
|
||||
while True:
|
||||
total_length = len(tokens_a) + len(tokens_b)
|
||||
if total_length <= max_length:
|
||||
break
|
||||
if len(tokens_a) > len(tokens_b):
|
||||
tokens_a.pop()
|
||||
else:
|
||||
tokens_b.pop()
|
||||
|
||||
def _truncate_seq_triple(tokens_a, tokens_b, tokens_c, max_length):
|
||||
"""Truncates a sequence triple in place to the maximum length."""
|
||||
|
||||
# This is a simple heuristic which will always truncate the longer sequence
|
||||
# one token at a time. This makes more sense than truncating an equal percent
|
||||
# of tokens from each, since if one sequence is very short then each token
|
||||
# that's truncated likely contains more information than a longer sequence.
|
||||
while True:
|
||||
total_length = len(tokens_a) + len(tokens_b) + len(tokens_c)
|
||||
if total_length <= max_length:
|
||||
break
|
||||
if len(tokens_a) > len(tokens_b) and len(tokens_a) > len(tokens_c):
|
||||
tokens_a.pop()
|
||||
elif len(tokens_b) > len(tokens_a) and len(tokens_b) > len(tokens_c):
|
||||
tokens_b.pop()
|
||||
elif len(tokens_c) > len(tokens_a) and len(tokens_c) > len(tokens_b):
|
||||
tokens_c.pop()
|
||||
else:
|
||||
tokens_c.pop()
|
||||
|
||||
|
||||
@cache_results(_cache_fp="./dataset")
|
||||
def get_dataset(args, processor, label_list, tokenizer, mode):
|
||||
|
||||
assert mode in ["train", "dev", "test"], "mode must be in train dev test!"
|
||||
|
||||
# use training data to construct the entity embedding
|
||||
combine_train_and_test = False
|
||||
if args.faiss_init and mode == "test" and not args.pretrain:
|
||||
mode = "train"
|
||||
if "ind" in args.data_dir: combine_train_and_test = True
|
||||
else:
|
||||
pass
|
||||
|
||||
if mode == "train":
|
||||
train_examples = processor.get_train_examples(args.data_dir)
|
||||
elif mode == "dev":
|
||||
train_examples = processor.get_dev_examples(args.data_dir)
|
||||
else:
|
||||
train_examples = processor.get_test_examples(args.data_dir)
|
||||
|
||||
if combine_train_and_test:
|
||||
logger.info("use all the dataset for getting the entity mask embedding in pretraining pretraining")
|
||||
logger.info("use all the dataset for getting the entity mask embedding in pretraining pretraining")
|
||||
train_examples = processor.get_test_examples(args.data_dir) + processor.get_train_examples(args.data_dir) + processor.get_dev_examples(args.data_dir)
|
||||
|
||||
from os import cpu_count
|
||||
with open(os.path.join(args.data_dir, f"examples_{mode}.txt"), 'w') as file:
|
||||
for line in train_examples:
|
||||
d = {}
|
||||
d.update(line.__dict__)
|
||||
file.write(json.dumps(d) + '\n')
|
||||
|
||||
# Re-loading the tokenizer with from_pretrained should not be needed here; the tokenizer that already contains the added entity/relation tokens must be reused.
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=False)
|
||||
|
||||
features = []
|
||||
|
||||
file_inputs = [os.path.join(args.data_dir, f"examples_{mode}.txt")]
|
||||
file_outputs = [os.path.join(args.data_dir, f"features_{mode}.txt")]
|
||||
|
||||
with contextlib.ExitStack() as stack:
|
||||
inputs = [
|
||||
stack.enter_context(open(input, "r", encoding="utf-8"))
|
||||
if input != "-" else sys.stdin
|
||||
for input in file_inputs
|
||||
]
|
||||
outputs = [
|
||||
stack.enter_context(open(output, "w", encoding="utf-8"))
|
||||
if output != "-" else sys.stdout
|
||||
for output in file_outputs
|
||||
]
|
||||
|
||||
encoder = MultiprocessingEncoder(tokenizer, args)
|
||||
pool = Pool(16, initializer=encoder.initializer)
|
||||
encoder.initializer()
|
||||
encoded_lines = pool.imap(encoder.encode_lines, zip(*inputs), 1000)
|
||||
# encoded_lines = map(encoder.encode_lines, zip(*inputs))
|
||||
|
||||
stats = Counter()
|
||||
for i, (filt, enc_lines) in tqdm(enumerate(encoded_lines, start=1), total=len(train_examples)):
|
||||
if filt == "PASS":
|
||||
for enc_line, output_h in zip(enc_lines, outputs):
|
||||
features.append(eval(enc_line))
|
||||
# features.append(enc_line)
|
||||
# print(enc_line, file=output_h)
|
||||
else:
|
||||
stats["num_filtered_" + filt] += 1
|
||||
|
||||
for k, v in stats.most_common():
|
||||
print("[{}] filtered {} lines".format(k, v), file=sys.stderr)
|
||||
|
||||
for f_id, f in enumerate(features):
|
||||
en = features[f_id].pop("en")
|
||||
rel = features[f_id].pop("rel")
|
||||
graph = features[f_id].pop("graph")
|
||||
real_label = f['label']
|
||||
features[f_id]['distance_attention'] = torch.Tensor(features[f_id]['distance_attention'])
|
||||
|
||||
cnt = 0
|
||||
cnt_2 = 0
|
||||
if not isinstance(en, list): break
|
||||
|
||||
pos = 0
|
||||
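# Rewrite the placeholder tokens emitted by solve(): each [PAD] slot receives the id of a known
# entity/relation of the triple (shifted past the tokenizer vocabulary), each [UNK] slot receives
# the id of a sampled subgraph node, and `pos` records where the gold entity token ended up.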
for i,t in enumerate(f['input_ids']):
|
||||
if t == tokenizer.pad_token_id:
|
||||
features[f_id]['input_ids'][i] = en[cnt] + len(tokenizer)
|
||||
cnt += 1
|
||||
if t == tokenizer.unk_token_id:
|
||||
features[f_id]['input_ids'][i] = graph[cnt_2] + len(tokenizer)
|
||||
cnt_2 += 1
|
||||
if features[f_id]['input_ids'][i] == real_label + len(tokenizer):
|
||||
pos = i
|
||||
if cnt_2 == len(graph) and cnt == len(en): break
|
||||
# If the token is [UNK], pop the next id from the graph-node list and substitute it.
|
||||
assert not (args.faiss_init and pos == 0)
|
||||
features[f_id]['pos'] = pos
|
||||
|
||||
# for i,t in enumerate(f['input_ids']):
|
||||
# if t == tokenizer.pad_token_id:
|
||||
# features[f_id]['input_ids'][i] = rel + len(tokenizer) + num_entities
|
||||
# break
|
||||
|
||||
|
||||
|
||||
features = KGCDataset(features)
|
||||
return features
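# Note: the @cache_results(_cache_fp="./dataset") decorator above presumably caches the
# constructed KGCDataset, so repeated runs with identical arguments can reuse it instead of
# rebuilding the features.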
|
||||
|
||||
|
||||
class MultiprocessingEncoder(object):
|
||||
def __init__(self, tokenizer, args):
|
||||
self.tokenizer = tokenizer
|
||||
self.pretrain = args.pretrain
|
||||
self.max_seq_length = args.max_seq_length
|
||||
|
||||
def initializer(self):
|
||||
global bpe
|
||||
bpe = self.tokenizer
|
||||
|
||||
def encode(self, line):
|
||||
global bpe
|
||||
ids = bpe.encode(line)
|
||||
return list(map(str, ids))
|
||||
|
||||
def decode(self, tokens):
|
||||
global bpe
|
||||
return bpe.decode(tokens)
|
||||
|
||||
def encode_lines(self, lines):
|
||||
"""
|
||||
Encode a set of lines. All lines will be encoded together.
|
||||
"""
|
||||
enc_lines = []
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
if len(line) == 0:
|
||||
return ["EMPTY", None]
|
||||
# enc_lines.append(" ".join(tokens))
|
||||
enc_lines.append(json.dumps(self.convert_examples_to_features(example=eval(line))))
|
||||
# enc_lines.append(" ")
|
||||
# enc_lines.append("123")
|
||||
return ["PASS", enc_lines]
|
||||
|
||||
def decode_lines(self, lines):
|
||||
dec_lines = []
|
||||
for line in lines:
|
||||
tokens = map(int, line.strip().split())
|
||||
dec_lines.append(self.decode(tokens))
|
||||
return ["PASS", dec_lines]
|
||||
|
||||
def convert_examples_to_features(self, example):
|
||||
pretrain = self.pretrain
|
||||
max_seq_length = self.max_seq_length
|
||||
global bpe
|
||||
"""Loads a data file into a list of `InputBatch`s."""
|
||||
# tokens_a = tokenizer.tokenize(example.text_a)
|
||||
# tokens_b = tokenizer.tokenize(example.text_b)
|
||||
# tokens_c = tokenizer.tokenize(example.text_c)
|
||||
|
||||
# _truncate_seq_triple(tokens_a, tokens_b, tokens_c, max_length= max_seq_length)
|
||||
# text_a = " ".join(example['text_a'].split()[:128])
|
||||
# text_b = " ".join(example['text_b'].split()[:128])
|
||||
# text_c = " ".join(example['text_c'].split()[:128])
|
||||
|
||||
text_a = example['text_a']
|
||||
text_b = example['text_b']
|
||||
text_c = example['text_c']
|
||||
text_d = example['text_d']
|
||||
graph_list = example['graph_inf']
|
||||
|
||||
if pretrain:
|
||||
# pretraining prompt: "The description of [MASK] is that <entity description> ."
|
||||
input_text = f"The description of {text_a} is that {text_b} ."
|
||||
inputs = bpe(
|
||||
input_text,
|
||||
truncation="longest_first",
|
||||
max_length=max_seq_length,
|
||||
padding="longest",
|
||||
add_special_tokens=True,
|
||||
)
|
||||
else:
|
||||
if text_a == "[MASK]":
|
||||
input_text_a = " ".join([text_a, text_b])
|
||||
input_text_b = text_c
|
||||
origin_triplet = ["MASK"] + example['en']
|
||||
graph_seq = ["MASK"] + example['en'] + text_d
|
||||
else:
|
||||
input_text_a = text_a
|
||||
input_text_b = " ".join([text_b, text_c])
|
||||
origin_triplet = example['en'] + ["MASK"]
|
||||
graph_seq = example['en'] + ["MASK"] + text_d
|
||||
# Append the graph information: one [UNK] placeholder per subgraph node in text_d.
|
||||
input_text_b = " ".join(["[CLS]", input_text_a, input_text_b, bpe.unk_token * len(text_d)])
|
||||
|
||||
inputs = bpe(
|
||||
input_text_b,
|
||||
truncation="longest_first",
|
||||
max_length=max_seq_length,
|
||||
padding="longest",
|
||||
add_special_tokens=False,
|
||||
)
|
||||
# assert bpe.mask_token_id in inputs.input_ids, "mask token must in input"
|
||||
|
||||
# Read the graph-structure information out of input_text_b (graph_seq), e.g. a sequence like:
# [CLS] [ENTITY_13258] [RELATION_68] [MASK] [ENTITY_4] [RELATION_127] [ENTITY_8] [RELATION_9] [ENTITY_9011] [ENTITY_12477] [PAD] [PAD]
# Collect the graph structure: solve() stores all subgraph triples of the example in a
# temporary field, available here as graph_list (example['graph_inf']).
|
||||
new_rel = set()
|
||||
new_rel.add(tuple((origin_triplet[0], origin_triplet[1])))
|
||||
new_rel.add(tuple((origin_triplet[1], origin_triplet[0])))
|
||||
new_rel.add(tuple((origin_triplet[1], origin_triplet[2])))
|
||||
new_rel.add(tuple((origin_triplet[2], origin_triplet[1])))
|
||||
for triplet in graph_list:
|
||||
rel1, rel2, rel3, rel4 = tuple((triplet[0], triplet[1])), tuple((triplet[1], triplet[2])), tuple((triplet[1], triplet[0])), tuple((triplet[2], triplet[1]))
|
||||
new_rel.add(rel1)
|
||||
new_rel.add(rel2)
|
||||
new_rel.add(rel3)
|
||||
new_rel.add(rel4)
|
||||
# Convert the subgraph triples into the edge set new_rel.
|
||||
KGid2Graphid_map = defaultdict(int)
|
||||
for i in range(len(graph_seq)):
|
||||
KGid2Graphid_map[graph_seq[i]] = i
|
||||
|
||||
N = len(graph_seq)
|
||||
adj = torch.zeros([N, N], dtype=torch.bool)
|
||||
for item in list(new_rel):
|
||||
adj[KGid2Graphid_map[item[0]], KGid2Graphid_map[item[1]]] = True
|
||||
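# All-pairs shortest-path distances over the local subgraph; the resulting matrix becomes the
# structure-aware "distance_attention" bias attached to the features below.
# Tiny illustration (hypothetical 3-node graph): edges a-b and b-c give d(a,b)=1, d(b,c)=1 and d(a,c)=2.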
shortest_path_result, _ = algos.floyd_warshall(adj.numpy())
|
||||
max_dist = np.amax(shortest_path_result)
|
||||
# The [PAD] and [CLS] positions are padded out; any extra [SEP] tokens are also treated as [PAD].
# Add an attention_bias with the [PAD] positions set to -inf; it is processed before being fed
# to the model and added to the attention scores so the model cannot attend to [PAD].

# Add this attention bias inside Hugging Face's BertForMaskedLM (this may need further checking).
|
||||
# attention_bias = torch.zero(N, N, dtype=torch.float)
|
||||
# attention_bias[torch.tensor(shortest_path_result == )]
|
||||
features = asdict(InputFeatures(input_ids=inputs["input_ids"],
|
||||
attention_mask=inputs['attention_mask'],
|
||||
labels=example['label'],
|
||||
label=example['real_label'],
|
||||
en=example['en_id'],
|
||||
rel=example['rel'],
|
||||
graph=example['text_d_id'],
|
||||
distance_attention = shortest_path_result.tolist(),
|
||||
)
|
||||
)
|
||||
return features
|
@ -0,0 +1,6 @@
|
||||
{
|
||||
"cells": [],
|
||||
"metadata": {},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
17535
dataset/FB15k-237/dev.tsv
Normal file
17535
dataset/FB15k-237/dev.tsv
Normal file
File diff suppressed because it is too large
Load Diff
14541
dataset/FB15k-237/entities.txt
Normal file
14541
dataset/FB15k-237/entities.txt
Normal file
File diff suppressed because it is too large
Load Diff
14951
dataset/FB15k-237/entity2text.txt
Normal file
14951
dataset/FB15k-237/entity2text.txt
Normal file
File diff suppressed because it is too large
Load Diff
14951
dataset/FB15k-237/entity2textlong.txt
Normal file
14951
dataset/FB15k-237/entity2textlong.txt
Normal file
File diff suppressed because it is too large
Load Diff
0
dataset/FB15k-237/features_dev.txt
Normal file
0
dataset/FB15k-237/features_dev.txt
Normal file
0
dataset/FB15k-237/features_test.txt
Normal file
0
dataset/FB15k-237/features_test.txt
Normal file
0
dataset/FB15k-237/features_train.txt
Normal file
0
dataset/FB15k-237/features_train.txt
Normal file
155
dataset/FB15k-237/get_neighbor.ipynb
Normal file
155
dataset/FB15k-237/get_neighbor.ipynb
Normal file
@ -0,0 +1,155 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"path1 = './entities.txt'\n",
|
||||
"path2 = './relations.txt'\n",
|
||||
"path3 = './train.tsv'\n",
|
||||
"path4 = './dev.tsv'\n",
|
||||
"path5 = './test.tsv'\n",
|
||||
"path6 = './get_neighbor/entity2id.txt'\n",
|
||||
"path7 = './get_neighbor/relation2id.txt'\n",
|
||||
"path8 = './get_neighbor/train2id.txt'\n",
|
||||
"path9 = './get_neighbor/valid2id.txt'\n",
|
||||
"path10 = './get_neighbor/test2id.txt'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(path1, 'r') as f:\n",
|
||||
" a = f.readlines()\n",
|
||||
"cnt = 0\n",
|
||||
"with open(path6, 'w') as f:\n",
|
||||
" for line in a:\n",
|
||||
" en = line.strip()\n",
|
||||
" f.write(en + '\\t' + str(cnt) + '\\n')\n",
|
||||
" cnt += 1\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(path2, 'r') as f:\n",
|
||||
" a = f.readlines()\n",
|
||||
"cnt = 0\n",
|
||||
"with open(path7, 'w') as f:\n",
|
||||
" for line in a:\n",
|
||||
" re = line.strip()\n",
|
||||
" f.write(re + '\\t' + str(cnt) + '\\n')\n",
|
||||
" cnt += 1\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(path6, 'r') as f:\n",
|
||||
" a = f.readlines()\n",
|
||||
"en2id = {}\n",
|
||||
"for line in a:\n",
|
||||
" b = line.strip().split('\\t')\n",
|
||||
" en, num = b[0], b[1]\n",
|
||||
" en2id[en] = num"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(path7, 'r') as f:\n",
|
||||
" a = f.readlines()\n",
|
||||
"re2id = {}\n",
|
||||
"for line in a:\n",
|
||||
" b = line.strip().split('\\t')\n",
|
||||
" re, num = b[0], b[1]\n",
|
||||
" re2id[re] = num"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(path3, 'r') as f:\n",
|
||||
" a = f.readlines()\n",
|
||||
"with open(path8, 'w') as f:\n",
|
||||
" for line in a:\n",
|
||||
" b = line.strip().split('\\t')\n",
|
||||
" h, r, t = b[0], b[1], b[2]\n",
|
||||
" f.write(en2id[h] + ' ' + re2id[r] + ' ' + en2id[t] + '\\n')\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(path4, 'r') as f:\n",
|
||||
" a = f.readlines()\n",
|
||||
"with open(path9, 'w') as f:\n",
|
||||
" for line in a:\n",
|
||||
" b = line.strip().split('\\t')\n",
|
||||
" h, r, t = b[0], b[1], b[2]\n",
|
||||
" f.write(en2id[h] + ' ' + re2id[r] + ' ' + en2id[t] + '\\n')\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(path5, 'r') as f:\n",
|
||||
" a = f.readlines()\n",
|
||||
"with open(path10, 'w') as f:\n",
|
||||
" for line in a:\n",
|
||||
" b = line.strip().split('\\t')\n",
|
||||
" h, r, t = b[0], b[1], b[2]\n",
|
||||
" f.write(en2id[h] + ' ' + re2id[r] + ' ' + en2id[t] + '\\n')\n",
|
||||
" "
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python [default]",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
14541
dataset/FB15k-237/get_neighbor/entity2id.txt
Normal file
14541
dataset/FB15k-237/get_neighbor/entity2id.txt
Normal file
File diff suppressed because it is too large
Load Diff
237
dataset/FB15k-237/get_neighbor/relation2id.txt
Normal file
237
dataset/FB15k-237/get_neighbor/relation2id.txt
Normal file
@ -0,0 +1,237 @@
|
||||
/soccer/football_team/current_roster./soccer/football_roster_position/position 0
|
||||
/music/artist/origin 1
|
||||
/ice_hockey/hockey_team/current_roster./sports/sports_team_roster/position 2
|
||||
/food/food/nutrients./food/nutrition_fact/nutrient 3
|
||||
/film/actor/film./film/performance/film 4
|
||||
/award/award_nominee/award_nominations./award/award_nomination/nominated_for 5
|
||||
/government/political_party/politicians_in_this_party./government/political_party_tenure/politician 6
|
||||
/base/schemastaging/person_extra/net_worth./measurement_unit/dated_money_value/currency 7
|
||||
/people/deceased_person/place_of_death 8
|
||||
/people/person/profession 9
|
||||
/location/administrative_division/first_level_division_of 10
|
||||
/base/marchmadness/ncaa_basketball_tournament/seeds./base/marchmadness/ncaa_tournament_seed/team 11
|
||||
/education/university/international_tuition./measurement_unit/dated_money_value/currency 12
|
||||
/location/us_county/county_seat 13
|
||||
/location/location/partially_contains 14
|
||||
/tv/tv_program/program_creator 15
|
||||
/film/film/music 16
|
||||
/tv/tv_program/languages 17
|
||||
/common/topic/webpage./common/webpage/category 18
|
||||
/user/tsegaran/random/taxonomy_subject/entry./user/tsegaran/random/taxonomy_entry/taxonomy 19
|
||||
/education/field_of_study/students_majoring./education/education/major_field_of_study 20
|
||||
/business/business_operation/assets./measurement_unit/dated_money_value/currency 21
|
||||
/film/film_set_designer/film_sets_designed 22
|
||||
/dataworld/gardening_hint/split_to 23
|
||||
/people/person/languages 24
|
||||
/business/job_title/people_with_this_title./business/employment_tenure/company 25
|
||||
/location/country/form_of_government 26
|
||||
/base/schemastaging/organization_extra/phone_number./base/schemastaging/phone_sandbox/service_language 27
|
||||
/people/person/place_of_birth 28
|
||||
/sports/sports_team/colors 29
|
||||
/education/educational_institution/school_type 30
|
||||
/award/award_category/winners./award/award_honor/award_winner 31
|
||||
/organization/organization/headquarters./location/mailing_address/citytown 32
|
||||
/education/educational_degree/people_with_this_degree./education/education/student 33
|
||||
/government/legislative_session/members./government/government_position_held/legislative_sessions 34
|
||||
/film/film/distributors./film/film_film_distributor_relationship/film_distribution_medium 35
|
||||
/education/educational_degree/people_with_this_degree./education/education/major_field_of_study 36
|
||||
/location/hud_county_place/county 37
|
||||
/location/administrative_division/country 38
|
||||
/film/film/film_production_design_by 39
|
||||
/award/award_winning_work/awards_won./award/award_honor/award 40
|
||||
/organization/organization/headquarters./location/mailing_address/state_province_region 41
|
||||
/base/schemastaging/organization_extra/phone_number./base/schemastaging/phone_sandbox/contact_category 42
|
||||
/tv/tv_program/country_of_origin 43
|
||||
/olympics/olympic_participating_country/medals_won./olympics/olympic_medal_honor/medal 44
|
||||
/location/country/second_level_divisions 45
|
||||
/award/award_ceremony/awards_presented./award/award_honor/honored_for 46
|
||||
/organization/organization_member/member_of./organization/organization_membership/organization 47
|
||||
/education/educational_institution/campuses 48
|
||||
/music/artist/contribution./music/recording_contribution/performance_role 49
|
||||
/award/ranked_item/appears_in_ranked_lists./award/ranking/list 50
|
||||
/people/person/religion 51
|
||||
/travel/travel_destination/climate./travel/travel_destination_monthly_climate/month 52
|
||||
/film/special_film_performance_type/film_performance_type./film/performance/film 53
|
||||
/award/award_nominee/award_nominations./award/award_nomination/award 54
|
||||
/location/statistical_region/religions./location/religion_percentage/religion 55
|
||||
/sports/sports_league_draft/picks./sports/sports_league_draft_pick/school 56
|
||||
/film/film/distributors./film/film_film_distributor_relationship/region 57
|
||||
/government/politician/government_positions_held./government/government_position_held/legislative_sessions 58
|
||||
/organization/role/leaders./organization/leadership/organization 59
|
||||
/tv/tv_network/programs./tv/tv_network_duration/program 60
|
||||
/soccer/football_team/current_roster./sports/sports_team_roster/position 61
|
||||
/music/instrument/instrumentalists 62
|
||||
/business/business_operation/operating_income./measurement_unit/dated_money_value/currency 63
|
||||
/people/cause_of_death/people 64
|
||||
/film/film/film_art_direction_by 65
|
||||
/people/person/sibling_s./people/sibling_relationship/sibling 66
|
||||
/film/film/cinematography 67
|
||||
/film/actor/dubbing_performances./film/dubbing_performance/language 68
|
||||
/base/biblioness/bibs_location/state 69
|
||||
/base/petbreeds/city_with_dogs/top_breeds./base/petbreeds/dog_city_relationship/dog_breed 70
|
||||
/people/person/gender 71
|
||||
/education/field_of_study/students_majoring./education/education/student 72
|
||||
/base/popstra/celebrity/dated./base/popstra/dated/participant 73
|
||||
/sports/sports_team/roster./american_football/football_roster_position/position 74
|
||||
/award/award_winner/awards_won./award/award_honor/award_winner 75
|
||||
/olympics/olympic_participating_country/medals_won./olympics/olympic_medal_honor/olympics 76
|
||||
/film/director/film 77
|
||||
/tv/tv_producer/programs_produced./tv/tv_producer_term/program 78
|
||||
/film/film_distributor/films_distributed./film/film_film_distributor_relationship/film 79
|
||||
/olympics/olympic_games/sports 80
|
||||
/music/record_label/artist 81
|
||||
/education/university/local_tuition./measurement_unit/dated_money_value/currency 82
|
||||
/film/film/story_by 83
|
||||
/people/person/spouse_s./people/marriage/spouse 84
|
||||
/sports/sports_league/teams./sports/sports_league_participation/team 85
|
||||
/people/profession/specialization_of 86
|
||||
/base/americancomedy/celebrity_impressionist/celebrities_impersonated 87
|
||||
/tv/tv_program/genre 88
|
||||
/award/award_category/nominees./award/award_nomination/nominated_for 89
|
||||
/language/human_language/countries_spoken_in 90
|
||||
/organization/organization/headquarters./location/mailing_address/country 91
|
||||
/location/statistical_region/gdp_real./measurement_unit/adjusted_money_value/adjustment_currency 92
|
||||
/education/university/fraternities_and_sororities 93
|
||||
/award/award_nominee/award_nominations./award/award_nomination/award_nominee 94
|
||||
/military/military_combatant/military_conflicts./military/military_combatant_group/combatants 95
|
||||
/award/award_nominated_work/award_nominations./award/award_nomination/nominated_for 96
|
||||
/location/location/time_zones 97
|
||||
/film/film/dubbing_performances./film/dubbing_performance/actor 98
|
||||
/film/film_subject/films 99
|
||||
/education/educational_degree/people_with_this_degree./education/education/institution 100
|
||||
/education/educational_institution/colors 101
|
||||
/award/award_category/category_of 102
|
||||
/tv/tv_personality/tv_regular_appearances./tv/tv_regular_personal_appearance/program 103
|
||||
/film/film/language 104
|
||||
/music/group_member/membership./music/group_membership/group 105
|
||||
/business/business_operation/revenue./measurement_unit/dated_money_value/currency 106
|
||||
/film/film/film_festivals 107
|
||||
/film/actor/film./film/performance/special_performance_type 108
|
||||
/organization/non_profit_organization/registered_with./organization/non_profit_registration/registering_agency 109
|
||||
/government/politician/government_positions_held./government/government_position_held/jurisdiction_of_office 110
|
||||
/base/aareas/schema/administrative_area/administrative_parent 111
|
||||
/award/award_winning_work/awards_won./award/award_honor/award_winner 112
|
||||
/organization/organization/place_founded 113
|
||||
/soccer/football_player/current_team./sports/sports_team_roster/team 114
|
||||
/government/politician/government_positions_held./government/government_position_held/basic_title 115
|
||||
/music/artist/track_contributions./music/track_contribution/role 116
|
||||
/base/localfood/seasonal_month/produce_available./base/localfood/produce_availability/seasonal_months 117
|
||||
/celebrities/celebrity/celebrity_friends./celebrities/friendship/friend 118
|
||||
/sports/professional_sports_team/draft_picks./sports/sports_league_draft_pick/school 119
|
||||
/award/hall_of_fame/inductees./award/hall_of_fame_induction/inductee 120
|
||||
/influence/influence_node/peers./influence/peer_relationship/peers 121
|
||||
/medicine/disease/risk_factors 122
|
||||
/broadcast/content/artist 123
|
||||
/film/film/estimated_budget./measurement_unit/dated_money_value/currency 124
|
||||
/military/military_conflict/combatants./military/military_combatant_group/combatants 125
|
||||
/location/capital_of_administrative_division/capital_of./location/administrative_division_capital_relationship/administrative_division 126
|
||||
/tv/tv_program/regular_cast./tv/regular_tv_appearance/actor 127
|
||||
/people/deceased_person/place_of_burial 128
|
||||
/location/location/adjoin_s./location/adjoining_relationship/adjoins 129
|
||||
/music/group_member/membership./music/group_membership/role 130
|
||||
/award/award_ceremony/awards_presented./award/award_honor/award_winner 131
|
||||
/film/film/prequel 132
|
||||
/film/film/produced_by 133
|
||||
/tv/tv_program/tv_producer./tv/tv_producer_term/producer_type 134
|
||||
/sports/sports_position/players./sports/sports_team_roster/team 135
|
||||
/olympics/olympic_games/participating_countries 136
|
||||
/music/genre/parent_genre 137
|
||||
/tv/tv_writer/tv_programs./tv/tv_program_writer_relationship/tv_program 138
|
||||
/music/genre/artists 139
|
||||
/film/film/genre 140
|
||||
/people/person/employment_history./business/employment_tenure/company 141
|
||||
/education/university/domestic_tuition./measurement_unit/dated_money_value/currency 142
|
||||
/people/person/nationality 143
|
||||
/location/country/capital 144
|
||||
/location/statistical_region/gni_per_capita_in_ppp_dollars./measurement_unit/dated_money_value/currency 145
|
||||
/base/aareas/schema/administrative_area/capital 146
|
||||
/business/business_operation/industry 147
|
||||
/location/hud_foreclosure_area/estimated_number_of_mortgages./measurement_unit/dated_integer/source 148
|
||||
/film/film/other_crew./film/film_crew_gig/crewmember 149
|
||||
/base/popstra/location/vacationers./base/popstra/vacation_choice/vacationer 150
|
||||
/film/film/film_format 151
|
||||
/medicine/disease/notable_people_with_this_condition 152
|
||||
/film/film/costume_design_by 153
|
||||
/government/government_office_category/officeholders./government/government_position_held/jurisdiction_of_office 154
|
||||
/location/statistical_region/gdp_nominal./measurement_unit/dated_money_value/currency 155
|
||||
/sports/sports_team/roster./baseball/baseball_roster_position/position 156
|
||||
/award/award_winning_work/awards_won./award/award_honor/honored_for 157
|
||||
/olympics/olympic_sport/athletes./olympics/olympic_athlete_affiliation/olympics 158
|
||||
/celebrities/celebrity/sexual_relationships./celebrities/romantic_relationship/celebrity 159
|
||||
/people/marriage_union_type/unions_of_this_type./people/marriage/location_of_ceremony 160
|
||||
/organization/organization/child./organization/organization_relationship/child 161
|
||||
/organization/organization_founder/organizations_founded 162
|
||||
/sports/sports_team/sport 163
|
||||
/people/ethnicity/geographic_distribution 164
|
||||
/location/statistical_region/places_exported_to./location/imports_and_exports/exported_to 165
|
||||
/location/country/official_language 166
|
||||
/film/film/production_companies 167
|
||||
/user/jg/default_domain/olympic_games/sports 168
|
||||
/time/event/locations 169
|
||||
/people/person/spouse_s./people/marriage/type_of_union 170
|
||||
/government/governmental_body/members./government/government_position_held/legislative_sessions 171
|
||||
/media_common/netflix_genre/titles 172
|
||||
/user/alexander/philosophy/philosopher/interests 173
|
||||
/film/film/runtime./film/film_cut/film_release_region 174
|
||||
/education/educational_institution/students_graduates./education/education/student 175
|
||||
/base/eating/practicer_of_diet/diet 176
|
||||
/tv/non_character_role/tv_regular_personal_appearances./tv/tv_regular_personal_appearance/person 177
|
||||
/sports/sports_position/players./sports/sports_team_roster/position 178
|
||||
/sports/professional_sports_team/draft_picks./sports/sports_league_draft_pick/draft 179
|
||||
/medicine/symptom/symptom_of 180
|
||||
/film/person_or_entity_appearing_in_film/films./film/personal_film_appearance/type_of_appearance 181
|
||||
/sports/sports_team_location/teams 182
|
||||
/american_football/football_team/current_roster./sports/sports_team_roster/position 183
|
||||
/people/person/places_lived./people/place_lived/location 184
|
||||
/location/statistical_region/rent50_2./measurement_unit/dated_money_value/currency 185
|
||||
/film/film/personal_appearances./film/personal_film_appearance/person 186
|
||||
/music/instrument/family 187
|
||||
/sports/sports_team/roster./basketball/basketball_roster_position/position 188
|
||||
/base/schemastaging/organization_extra/phone_number./base/schemastaging/phone_sandbox/service_location 189
|
||||
/film/film/release_date_s./film/film_regional_release_date/film_release_region 190
|
||||
/award/award_category/disciplines_or_subjects 191
|
||||
/base/popstra/celebrity/friendship./base/popstra/friendship/participant 192
|
||||
/music/performance_role/regular_performances./music/group_membership/group 193
|
||||
/film/film/edited_by 194
|
||||
/base/x2010fifaworldcupsouthafrica/world_cup_squad/current_world_cup_squad./base/x2010fifaworldcupsouthafrica/current_world_cup_squad/current_club 195
|
||||
/base/popstra/celebrity/canoodled./base/popstra/canoodled/participant 196
|
||||
/film/film/release_date_s./film/film_regional_release_date/film_release_distribution_medium 197
|
||||
/film/film/other_crew./film/film_crew_gig/film_crew_role 198
|
||||
/base/popstra/celebrity/breakup./base/popstra/breakup/participant 199
|
||||
/film/film/country 200
|
||||
/music/performance_role/regular_performances./music/group_membership/role 201
|
||||
/sports/sports_team/roster./american_football/football_historical_roster_position/position_s 202
|
||||
/film/film/release_date_s./film/film_regional_release_date/film_regional_debut_venue 203
|
||||
/time/event/instance_of_recurring_event 204
|
||||
/olympics/olympic_participating_country/athletes./olympics/olympic_athlete_affiliation/olympics 205
|
||||
/organization/endowed_organization/endowment./measurement_unit/dated_money_value/currency 206
|
||||
/travel/travel_destination/how_to_get_here./travel/transportation/mode_of_transportation 207
|
||||
/baseball/baseball_team/team_stats./baseball/baseball_team_stats/season 208
|
||||
/award/award_category/winners./award/award_honor/ceremony 209
|
||||
/government/legislative_session/members./government/government_position_held/district_represented 210
|
||||
/influence/influence_node/influenced_by 211
|
||||
/base/culturalevent/event/entity_involved 212
|
||||
/people/ethnicity/people 213
|
||||
/sports/sport/pro_athletes./sports/pro_sports_played/athlete 214
|
||||
/location/statistical_region/gdp_nominal_per_capita./measurement_unit/dated_money_value/currency 215
|
||||
/location/hud_county_place/place 216
|
||||
/base/aareas/schema/administrative_area/administrative_area_type 217
|
||||
/base/locations/continents/countries_within 218
|
||||
/sports/sports_position/players./american_football/football_historical_roster_position/position_s 219
|
||||
/people/person/spouse_s./people/marriage/location_of_ceremony 220
|
||||
/education/educational_institution/students_graduates./education/education/major_field_of_study 221
|
||||
/film/film/written_by 222
|
||||
/olympics/olympic_sport/athletes./olympics/olympic_athlete_affiliation/country 223
|
||||
/music/performance_role/guest_performances./music/recording_contribution/performance_role 224
|
||||
/film/film/featured_film_locations 225
|
||||
/education/educational_institution_campus/educational_institution 226
|
||||
/sports/pro_athlete/teams./sports/sports_team_roster/team 227
|
||||
/people/ethnicity/languages_spoken 228
|
||||
/film/film/executive_produced_by 229
|
||||
/tv/tv_producer/programs_produced./tv/tv_producer_term/producer_type 230
|
||||
/location/location/contains 231
|
||||
/base/biblioness/bibs_location/country 232
|
||||
/user/ktrueman/default_domain/international_organization/member_states 233
|
||||
/music/performance_role/track_performances./music/track_contribution/role 234
|
||||
/olympics/olympic_games/medals_awarded./olympics/olympic_medal_honor/medal 235
|
||||
/base/saturdaynightlive/snl_cast_member/seasons./base/saturdaynightlive/snl_season_tenure/cast_members 236
|
20466
dataset/FB15k-237/get_neighbor/test2id.txt
Normal file
20466
dataset/FB15k-237/get_neighbor/test2id.txt
Normal file
File diff suppressed because it is too large
Load Diff
272115
dataset/FB15k-237/get_neighbor/train2id.txt
Normal file
272115
dataset/FB15k-237/get_neighbor/train2id.txt
Normal file
File diff suppressed because it is too large
Load Diff
17535
dataset/FB15k-237/get_neighbor/valid2id.txt
Normal file
17535
dataset/FB15k-237/get_neighbor/valid2id.txt
Normal file
File diff suppressed because it is too large
Load Diff
237
dataset/FB15k-237/relation2text.txt
Normal file
237
dataset/FB15k-237/relation2text.txt
Normal file
@ -0,0 +1,237 @@
|
||||
/soccer/football_team/current_roster./soccer/football_roster_position/position soccer football team current roster. soccer football roster position position
|
||||
/music/artist/origin music artist origin
|
||||
/ice_hockey/hockey_team/current_roster./sports/sports_team_roster/position ice hockey hockey team current roster. sports sports team roster position
|
||||
/food/food/nutrients./food/nutrition_fact/nutrient food food nutrients. food nutrition fact nutrient
|
||||
/film/actor/film./film/performance/film film actor film. film performance film
|
||||
/award/award_nominee/award_nominations./award/award_nomination/nominated_for award award nominee award nominations. award award nomination nominated for
|
||||
/government/political_party/politicians_in_this_party./government/political_party_tenure/politician government political party politicians in this party. government political party tenure politician
|
||||
/base/schemastaging/person_extra/net_worth./measurement_unit/dated_money_value/currency base schemastaging person extra net worth. measurement unit dated money value currency
|
||||
/people/deceased_person/place_of_death people deceased person place of death
|
||||
/people/person/profession people person profession
|
||||
/location/administrative_division/first_level_division_of location administrative division first level division of
|
||||
/base/marchmadness/ncaa_basketball_tournament/seeds./base/marchmadness/ncaa_tournament_seed/team base marchmadness ncaa basketball tournament seeds. base marchmadness ncaa tournament seed team
|
||||
/education/university/international_tuition./measurement_unit/dated_money_value/currency education university international tuition. measurement unit dated money value currency
|
||||
/location/us_county/county_seat location us county county seat
|
||||
/location/location/partially_contains location location partially contains
|
||||
/tv/tv_program/program_creator tv tv program program creator
|
||||
/film/film/music film film music
|
||||
/tv/tv_program/languages tv tv program languages
|
||||
/common/topic/webpage./common/webpage/category common topic webpage. common webpage category
|
||||
/user/tsegaran/random/taxonomy_subject/entry./user/tsegaran/random/taxonomy_entry/taxonomy user tsegaran random taxonomy subject entry. user tsegaran random taxonomy entry taxonomy
|
||||
/education/field_of_study/students_majoring./education/education/major_field_of_study education field of study students majoring. education education major field of study
|
||||
/business/business_operation/assets./measurement_unit/dated_money_value/currency business business operation assets. measurement unit dated money value currency
|
||||
/film/film_set_designer/film_sets_designed film film set designer film sets designed
|
||||
/dataworld/gardening_hint/split_to dataworld gardening hint split to
|
||||
/people/person/languages people person languages
|
||||
/business/job_title/people_with_this_title./business/employment_tenure/company business job title people with this title. business employment tenure company
|
||||
/location/country/form_of_government location country form of government
|
||||
/base/schemastaging/organization_extra/phone_number./base/schemastaging/phone_sandbox/service_language base schemastaging organization extra phone number. base schemastaging phone sandbox service language
|
||||
/people/person/place_of_birth people person place of birth
|
||||
/sports/sports_team/colors sports sports team colors
|
||||
/education/educational_institution/school_type education educational institution school type
|
||||
/award/award_category/winners./award/award_honor/award_winner award award category winners. award award honor award winner
|
||||
/organization/organization/headquarters./location/mailing_address/citytown organization organization headquarters. location mailing address citytown
|
||||
/education/educational_degree/people_with_this_degree./education/education/student education educational degree people with this degree. education education student
|
||||
/government/legislative_session/members./government/government_position_held/legislative_sessions government legislative session members. government government position held legislative sessions
|
||||
/film/film/distributors./film/film_film_distributor_relationship/film_distribution_medium film film distributors. film film film distributor relationship film distribution medium
|
||||
/education/educational_degree/people_with_this_degree./education/education/major_field_of_study education educational degree people with this degree. education education major field of study
|
||||
/location/hud_county_place/county location hud county place county
|
||||
/location/administrative_division/country location administrative division country
|
||||
/film/film/film_production_design_by film film film production design by
|
||||
/award/award_winning_work/awards_won./award/award_honor/award award award winning work awards won. award award honor award
|
||||
/organization/organization/headquarters./location/mailing_address/state_province_region organization organization headquarters. location mailing address state province region
|
||||
/base/schemastaging/organization_extra/phone_number./base/schemastaging/phone_sandbox/contact_category base schemastaging organization extra phone number. base schemastaging phone sandbox contact category
|
||||
/tv/tv_program/country_of_origin tv tv program country of origin
|
||||
/olympics/olympic_participating_country/medals_won./olympics/olympic_medal_honor/medal olympics olympic participating country medals won. olympics olympic medal honor medal
|
||||
/location/country/second_level_divisions location country second level divisions
|
||||
/award/award_ceremony/awards_presented./award/award_honor/honored_for award award ceremony awards presented. award award honor honored for
|
||||
/organization/organization_member/member_of./organization/organization_membership/organization organization organization member member of. organization organization membership organization
|
||||
/education/educational_institution/campuses education educational institution campuses
|
||||
/music/artist/contribution./music/recording_contribution/performance_role music artist contribution. music recording contribution performance role
|
||||
/award/ranked_item/appears_in_ranked_lists./award/ranking/list award ranked item appears in ranked lists. award ranking list
|
||||
/people/person/religion people person religion
|
||||
/travel/travel_destination/climate./travel/travel_destination_monthly_climate/month travel travel destination climate. travel travel destination monthly climate month
|
||||
/film/special_film_performance_type/film_performance_type./film/performance/film film special film performance type film performance type. film performance film
|
||||
/award/award_nominee/award_nominations./award/award_nomination/award award award nominee award nominations. award award nomination award
|
||||
/location/statistical_region/religions./location/religion_percentage/religion location statistical region religions. location religion percentage religion
|
||||
/sports/sports_league_draft/picks./sports/sports_league_draft_pick/school sports sports league draft picks. sports sports league draft pick school
|
||||
/film/film/distributors./film/film_film_distributor_relationship/region film film distributors. film film film distributor relationship region
|
||||
/government/politician/government_positions_held./government/government_position_held/legislative_sessions government politician government positions held. government government position held legislative sessions
|
||||
/organization/role/leaders./organization/leadership/organization organization role leaders. organization leadership organization
|
||||
/tv/tv_network/programs./tv/tv_network_duration/program tv tv network programs. tv tv network duration program
|
||||
/soccer/football_team/current_roster./sports/sports_team_roster/position soccer football team current roster. sports sports team roster position
|
||||
/music/instrument/instrumentalists music instrument instrumentalists
|
||||
/business/business_operation/operating_income./measurement_unit/dated_money_value/currency business business operation operating income. measurement unit dated money value currency
|
||||
/people/cause_of_death/people people cause of death people
|
||||
/film/film/film_art_direction_by film film film art direction by
|
||||
/people/person/sibling_s./people/sibling_relationship/sibling people person sibling s. people sibling relationship sibling
|
||||
/film/film/cinematography film film cinematography
|
||||
/film/actor/dubbing_performances./film/dubbing_performance/language film actor dubbing performances. film dubbing performance language
|
||||
/base/biblioness/bibs_location/state base biblioness bibs location state
|
||||
/base/petbreeds/city_with_dogs/top_breeds./base/petbreeds/dog_city_relationship/dog_breed base petbreeds city with dogs top breeds. base petbreeds dog city relationship dog breed
|
||||
/people/person/gender people person gender
|
||||
/education/field_of_study/students_majoring./education/education/student education field of study students majoring. education education student
|
||||
/base/popstra/celebrity/dated./base/popstra/dated/participant base popstra celebrity dated. base popstra dated participant
|
||||
/sports/sports_team/roster./american_football/football_roster_position/position sports sports team roster. american football football roster position position
|
||||
/award/award_winner/awards_won./award/award_honor/award_winner award award winner awards won. award award honor award winner
|
||||
/olympics/olympic_participating_country/medals_won./olympics/olympic_medal_honor/olympics olympics olympic participating country medals won. olympics olympic medal honor olympics
|
||||
/film/director/film film director film
|
||||
/tv/tv_producer/programs_produced./tv/tv_producer_term/program tv tv producer programs produced. tv tv producer term program
|
||||
/film/film_distributor/films_distributed./film/film_film_distributor_relationship/film film film distributor films distributed. film film film distributor relationship film
|
||||
/olympics/olympic_games/sports olympics olympic games sports
|
||||
/music/record_label/artist music record label artist
|
||||
/education/university/local_tuition./measurement_unit/dated_money_value/currency education university local tuition. measurement unit dated money value currency
|
||||
/film/film/story_by film film story by
|
||||
/people/person/spouse_s./people/marriage/spouse people person spouse s. people marriage spouse
|
||||
/sports/sports_league/teams./sports/sports_league_participation/team sports sports league teams. sports sports league participation team
|
||||
/people/profession/specialization_of people profession specialization of
|
||||
/base/americancomedy/celebrity_impressionist/celebrities_impersonated base americancomedy celebrity impressionist celebrities impersonated
|
||||
/tv/tv_program/genre tv tv program genre
|
||||
/award/award_category/nominees./award/award_nomination/nominated_for award award category nominees. award award nomination nominated for
|
||||
/language/human_language/countries_spoken_in language human language countries spoken in
|
||||
/organization/organization/headquarters./location/mailing_address/country organization organization headquarters. location mailing address country
|
||||
/location/statistical_region/gdp_real./measurement_unit/adjusted_money_value/adjustment_currency location statistical region gdp real. measurement unit adjusted money value adjustment currency
|
||||
/education/university/fraternities_and_sororities education university fraternities and sororities
|
||||
/award/award_nominee/award_nominations./award/award_nomination/award_nominee award award nominee award nominations. award award nomination award nominee
|
||||
/military/military_combatant/military_conflicts./military/military_combatant_group/combatants military military combatant military conflicts. military military combatant group combatants
|
||||
/award/award_nominated_work/award_nominations./award/award_nomination/nominated_for award award nominated work award nominations. award award nomination nominated for
|
||||
/location/location/time_zones location location time zones
|
||||
/film/film/dubbing_performances./film/dubbing_performance/actor film film dubbing performances. film dubbing performance actor
|
||||
/film/film_subject/films film film subject films
|
||||
/education/educational_degree/people_with_this_degree./education/education/institution education educational degree people with this degree. education education institution
|
||||
/education/educational_institution/colors education educational institution colors
|
||||
/award/award_category/category_of award award category category of
|
||||
/tv/tv_personality/tv_regular_appearances./tv/tv_regular_personal_appearance/program tv tv personality tv regular appearances. tv tv regular personal appearance program
|
||||
/film/film/language film film language
|
||||
/music/group_member/membership./music/group_membership/group music group member membership. music group membership group
|
||||
/business/business_operation/revenue./measurement_unit/dated_money_value/currency business business operation revenue. measurement unit dated money value currency
|
||||
/film/film/film_festivals film film film festivals
|
||||
/film/actor/film./film/performance/special_performance_type film actor film. film performance special performance type
|
||||
/organization/non_profit_organization/registered_with./organization/non_profit_registration/registering_agency organization non profit organization registered with. organization non profit registration registering agency
|
||||
/government/politician/government_positions_held./government/government_position_held/jurisdiction_of_office government politician government positions held. government government position held jurisdiction of office
|
||||
/base/aareas/schema/administrative_area/administrative_parent base aareas schema administrative area administrative parent
|
||||
/award/award_winning_work/awards_won./award/award_honor/award_winner award award winning work awards won. award award honor award winner
|
||||
/organization/organization/place_founded organization organization place founded
|
||||
/soccer/football_player/current_team./sports/sports_team_roster/team soccer football player current team. sports sports team roster team
|
||||
/government/politician/government_positions_held./government/government_position_held/basic_title government politician government positions held. government government position held basic title
|
||||
/music/artist/track_contributions./music/track_contribution/role music artist track contributions. music track contribution role
|
||||
/base/localfood/seasonal_month/produce_available./base/localfood/produce_availability/seasonal_months base localfood seasonal month produce available. base localfood produce availability seasonal months
|
||||
/celebrities/celebrity/celebrity_friends./celebrities/friendship/friend celebrities celebrity celebrity friends. celebrities friendship friend
|
||||
/sports/professional_sports_team/draft_picks./sports/sports_league_draft_pick/school sports professional sports team draft picks. sports sports league draft pick school
|
||||
/award/hall_of_fame/inductees./award/hall_of_fame_induction/inductee award hall of fame inductees. award hall of fame induction inductee
|
||||
/influence/influence_node/peers./influence/peer_relationship/peers influence influence node peers. influence peer relationship peers
|
||||
/medicine/disease/risk_factors medicine disease risk factors
|
||||
/broadcast/content/artist broadcast content artist
|
||||
/film/film/estimated_budget./measurement_unit/dated_money_value/currency film film estimated budget. measurement unit dated money value currency
|
||||
/military/military_conflict/combatants./military/military_combatant_group/combatants military military conflict combatants. military military combatant group combatants
|
||||
/location/capital_of_administrative_division/capital_of./location/administrative_division_capital_relationship/administrative_division location capital of administrative division capital of. location administrative division capital relationship administrative division
|
||||
/tv/tv_program/regular_cast./tv/regular_tv_appearance/actor tv tv program regular cast. tv regular tv appearance actor
|
||||
/people/deceased_person/place_of_burial people deceased person place of burial
|
||||
/location/location/adjoin_s./location/adjoining_relationship/adjoins location location adjoin s. location adjoining relationship adjoins
|
||||
/music/group_member/membership./music/group_membership/role music group member membership. music group membership role
|
||||
/award/award_ceremony/awards_presented./award/award_honor/award_winner award award ceremony awards presented. award award honor award winner
|
||||
/film/film/prequel film film prequel
|
||||
/film/film/produced_by film film produced by
|
||||
/tv/tv_program/tv_producer./tv/tv_producer_term/producer_type tv tv program tv producer. tv tv producer term producer type
|
||||
/sports/sports_position/players./sports/sports_team_roster/team sports sports position players. sports sports team roster team
|
||||
/olympics/olympic_games/participating_countries olympics olympic games participating countries
|
||||
/music/genre/parent_genre music genre parent genre
|
||||
/tv/tv_writer/tv_programs./tv/tv_program_writer_relationship/tv_program tv tv writer tv programs. tv tv program writer relationship tv program
|
||||
/music/genre/artists music genre artists
|
||||
/film/film/genre film film genre
|
||||
/people/person/employment_history./business/employment_tenure/company people person employment history. business employment tenure company
|
||||
/education/university/domestic_tuition./measurement_unit/dated_money_value/currency education university domestic tuition. measurement unit dated money value currency
|
||||
/people/person/nationality people person nationality
|
||||
/location/country/capital location country capital
|
||||
/location/statistical_region/gni_per_capita_in_ppp_dollars./measurement_unit/dated_money_value/currency location statistical region gni per capita in ppp dollars. measurement unit dated money value currency
|
||||
/base/aareas/schema/administrative_area/capital base aareas schema administrative area capital
|
||||
/business/business_operation/industry business business operation industry
|
||||
/location/hud_foreclosure_area/estimated_number_of_mortgages./measurement_unit/dated_integer/source location hud foreclosure area estimated number of mortgages. measurement unit dated integer source
|
||||
/film/film/other_crew./film/film_crew_gig/crewmember film film other crew. film film crew gig crewmember
|
||||
/base/popstra/location/vacationers./base/popstra/vacation_choice/vacationer base popstra location vacationers. base popstra vacation choice vacationer
|
||||
/film/film/film_format film film film format
|
||||
/medicine/disease/notable_people_with_this_condition medicine disease notable people with this condition
|
||||
/film/film/costume_design_by film film costume design by
|
||||
/government/government_office_category/officeholders./government/government_position_held/jurisdiction_of_office government government office category officeholders. government government position held jurisdiction of office
|
||||
/location/statistical_region/gdp_nominal./measurement_unit/dated_money_value/currency location statistical region gdp nominal. measurement unit dated money value currency
|
||||
/sports/sports_team/roster./baseball/baseball_roster_position/position sports sports team roster. baseball baseball roster position position
|
||||
/award/award_winning_work/awards_won./award/award_honor/honored_for award award winning work awards won. award award honor honored for
|
||||
/olympics/olympic_sport/athletes./olympics/olympic_athlete_affiliation/olympics olympics olympic sport athletes. olympics olympic athlete affiliation olympics
|
||||
/celebrities/celebrity/sexual_relationships./celebrities/romantic_relationship/celebrity celebrities celebrity sexual relationships. celebrities romantic relationship celebrity
|
||||
/people/marriage_union_type/unions_of_this_type./people/marriage/location_of_ceremony people marriage union type unions of this type. people marriage location of ceremony
|
||||
/organization/organization/child./organization/organization_relationship/child organization organization child. organization organization relationship child
|
||||
/organization/organization_founder/organizations_founded organization organization founder organizations founded
|
||||
/sports/sports_team/sport sports sports team sport
|
||||
/people/ethnicity/geographic_distribution people ethnicity geographic distribution
|
||||
/location/statistical_region/places_exported_to./location/imports_and_exports/exported_to location statistical region places exported to. location imports and exports exported to
|
||||
/location/country/official_language location country official language
|
||||
/film/film/production_companies film film production companies
|
||||
/user/jg/default_domain/olympic_games/sports user jg default domain olympic games sports
|
||||
/time/event/locations time event locations
|
||||
/people/person/spouse_s./people/marriage/type_of_union people person spouse s. people marriage type of union
|
||||
/government/governmental_body/members./government/government_position_held/legislative_sessions government governmental body members. government government position held legislative sessions
|
||||
/media_common/netflix_genre/titles media common netflix genre titles
|
||||
/user/alexander/philosophy/philosopher/interests user alexander philosophy philosopher interests
|
||||
/film/film/runtime./film/film_cut/film_release_region film film runtime. film film cut film release region
|
||||
/education/educational_institution/students_graduates./education/education/student education educational institution students graduates. education education student
|
||||
/base/eating/practicer_of_diet/diet base eating practicer of diet diet
|
||||
/tv/non_character_role/tv_regular_personal_appearances./tv/tv_regular_personal_appearance/person tv non character role tv regular personal appearances. tv tv regular personal appearance person
|
||||
/sports/sports_position/players./sports/sports_team_roster/position sports sports position players. sports sports team roster position
|
||||
/sports/professional_sports_team/draft_picks./sports/sports_league_draft_pick/draft sports professional sports team draft picks. sports sports league draft pick draft
|
||||
/medicine/symptom/symptom_of medicine symptom symptom of
|
||||
/film/person_or_entity_appearing_in_film/films./film/personal_film_appearance/type_of_appearance film person or entity appearing in film films. film personal film appearance type of appearance
|
||||
/sports/sports_team_location/teams sports sports team location teams
|
||||
/american_football/football_team/current_roster./sports/sports_team_roster/position american football football team current roster. sports sports team roster position
|
||||
/people/person/places_lived./people/place_lived/location people person places lived. people place lived location
|
||||
/location/statistical_region/rent50_2./measurement_unit/dated_money_value/currency location statistical region rent50 2. measurement unit dated money value currency
|
||||
/film/film/personal_appearances./film/personal_film_appearance/person film film personal appearances. film personal film appearance person
|
||||
/music/instrument/family music instrument family
|
||||
/sports/sports_team/roster./basketball/basketball_roster_position/position sports sports team roster. basketball basketball roster position position
|
||||
/base/schemastaging/organization_extra/phone_number./base/schemastaging/phone_sandbox/service_location base schemastaging organization extra phone number. base schemastaging phone sandbox service location
|
||||
/film/film/release_date_s./film/film_regional_release_date/film_release_region film film release date s. film film regional release date film release region
|
||||
/award/award_category/disciplines_or_subjects award award category disciplines or subjects
|
||||
/base/popstra/celebrity/friendship./base/popstra/friendship/participant base popstra celebrity friendship. base popstra friendship participant
|
||||
/music/performance_role/regular_performances./music/group_membership/group music performance role regular performances. music group membership group
|
||||
/film/film/edited_by film film edited by
|
||||
/base/x2010fifaworldcupsouthafrica/world_cup_squad/current_world_cup_squad./base/x2010fifaworldcupsouthafrica/current_world_cup_squad/current_club base x2010fifaworldcupsouthafrica world cup squad current world cup squad. base x2010fifaworldcupsouthafrica current world cup squad current club
|
||||
/base/popstra/celebrity/canoodled./base/popstra/canoodled/participant base popstra celebrity canoodled. base popstra canoodled participant
|
||||
/film/film/release_date_s./film/film_regional_release_date/film_release_distribution_medium film film release date s. film film regional release date film release distribution medium
|
||||
/film/film/other_crew./film/film_crew_gig/film_crew_role film film other crew. film film crew gig film crew role
|
||||
/base/popstra/celebrity/breakup./base/popstra/breakup/participant base popstra celebrity breakup. base popstra breakup participant
|
||||
/film/film/country film film country
|
||||
/music/performance_role/regular_performances./music/group_membership/role music performance role regular performances. music group membership role
|
||||
/sports/sports_team/roster./american_football/football_historical_roster_position/position_s sports sports team roster. american football football historical roster position position s
|
||||
/film/film/release_date_s./film/film_regional_release_date/film_regional_debut_venue film film release date s. film film regional release date film regional debut venue
|
||||
/time/event/instance_of_recurring_event time event instance of recurring event
|
||||
/olympics/olympic_participating_country/athletes./olympics/olympic_athlete_affiliation/olympics olympics olympic participating country athletes. olympics olympic athlete affiliation olympics
|
||||
/organization/endowed_organization/endowment./measurement_unit/dated_money_value/currency organization endowed organization endowment. measurement unit dated money value currency
|
||||
/travel/travel_destination/how_to_get_here./travel/transportation/mode_of_transportation travel travel destination how to get here. travel transportation mode of transportation
|
||||
/baseball/baseball_team/team_stats./baseball/baseball_team_stats/season baseball baseball team team stats. baseball baseball team stats season
|
||||
/award/award_category/winners./award/award_honor/ceremony award award category winners. award award honor ceremony
|
||||
/government/legislative_session/members./government/government_position_held/district_represented government legislative session members. government government position held district represented
|
||||
/influence/influence_node/influenced_by influence influence node influenced by
|
||||
/base/culturalevent/event/entity_involved base culturalevent event entity involved
|
||||
/people/ethnicity/people people ethnicity people
|
||||
/sports/sport/pro_athletes./sports/pro_sports_played/athlete sports sport pro athletes. sports pro sports played athlete
|
||||
/location/statistical_region/gdp_nominal_per_capita./measurement_unit/dated_money_value/currency location statistical region gdp nominal per capita. measurement unit dated money value currency
|
||||
/location/hud_county_place/place location hud county place place
|
||||
/base/aareas/schema/administrative_area/administrative_area_type base aareas schema administrative area administrative area type
|
||||
/base/locations/continents/countries_within base locations continents countries within
|
||||
/sports/sports_position/players./american_football/football_historical_roster_position/position_s sports sports position players. american football football historical roster position position s
|
||||
/people/person/spouse_s./people/marriage/location_of_ceremony people person spouse s. people marriage location of ceremony
|
||||
/education/educational_institution/students_graduates./education/education/major_field_of_study education educational institution students graduates. education education major field of study
|
||||
/film/film/written_by film film written by
|
||||
/olympics/olympic_sport/athletes./olympics/olympic_athlete_affiliation/country olympics olympic sport athletes. olympics olympic athlete affiliation country
|
||||
/music/performance_role/guest_performances./music/recording_contribution/performance_role music performance role guest performances. music recording contribution performance role
|
||||
/film/film/featured_film_locations film film featured film locations
|
||||
/education/educational_institution_campus/educational_institution education educational institution campus educational institution
|
||||
/sports/pro_athlete/teams./sports/sports_team_roster/team sports pro athlete teams. sports sports team roster team
|
||||
/people/ethnicity/languages_spoken people ethnicity languages spoken
|
||||
/film/film/executive_produced_by film film executive produced by
|
||||
/tv/tv_producer/programs_produced./tv/tv_producer_term/producer_type tv tv producer programs produced. tv tv producer term producer type
|
||||
/location/location/contains location location contains
|
||||
/base/biblioness/bibs_location/country base biblioness bibs location country
|
||||
/user/ktrueman/default_domain/international_organization/member_states user ktrueman default domain international organization member states
|
||||
/music/performance_role/track_performances./music/track_contribution/role music performance role track performances. music track contribution role
|
||||
/olympics/olympic_games/medals_awarded./olympics/olympic_medal_honor/medal olympics olympic games medals awarded. olympics olympic medal honor medal
|
||||
/base/saturdaynightlive/snl_cast_member/seasons./base/saturdaynightlive/snl_season_tenure/cast_members base saturdaynightlive snl cast member seasons. base saturdaynightlive snl season tenure cast members
|
237
dataset/FB15k-237/relations.txt
Normal file
@ -0,0 +1,237 @@
|
||||
/soccer/football_team/current_roster./soccer/football_roster_position/position
|
||||
/music/artist/origin
|
||||
/ice_hockey/hockey_team/current_roster./sports/sports_team_roster/position
|
||||
/food/food/nutrients./food/nutrition_fact/nutrient
|
||||
/film/actor/film./film/performance/film
|
||||
/award/award_nominee/award_nominations./award/award_nomination/nominated_for
|
||||
/government/political_party/politicians_in_this_party./government/political_party_tenure/politician
|
||||
/base/schemastaging/person_extra/net_worth./measurement_unit/dated_money_value/currency
|
||||
/people/deceased_person/place_of_death
|
||||
/people/person/profession
|
||||
/location/administrative_division/first_level_division_of
|
||||
/base/marchmadness/ncaa_basketball_tournament/seeds./base/marchmadness/ncaa_tournament_seed/team
|
||||
/education/university/international_tuition./measurement_unit/dated_money_value/currency
|
||||
/location/us_county/county_seat
|
||||
/location/location/partially_contains
|
||||
/tv/tv_program/program_creator
|
||||
/film/film/music
|
||||
/tv/tv_program/languages
|
||||
/common/topic/webpage./common/webpage/category
|
||||
/user/tsegaran/random/taxonomy_subject/entry./user/tsegaran/random/taxonomy_entry/taxonomy
|
||||
/education/field_of_study/students_majoring./education/education/major_field_of_study
|
||||
/business/business_operation/assets./measurement_unit/dated_money_value/currency
|
||||
/film/film_set_designer/film_sets_designed
|
||||
/dataworld/gardening_hint/split_to
|
||||
/people/person/languages
|
||||
/business/job_title/people_with_this_title./business/employment_tenure/company
|
||||
/location/country/form_of_government
|
||||
/base/schemastaging/organization_extra/phone_number./base/schemastaging/phone_sandbox/service_language
|
||||
/people/person/place_of_birth
|
||||
/sports/sports_team/colors
|
||||
/education/educational_institution/school_type
|
||||
/award/award_category/winners./award/award_honor/award_winner
|
||||
/organization/organization/headquarters./location/mailing_address/citytown
|
||||
/education/educational_degree/people_with_this_degree./education/education/student
|
||||
/government/legislative_session/members./government/government_position_held/legislative_sessions
|
||||
/film/film/distributors./film/film_film_distributor_relationship/film_distribution_medium
|
||||
/education/educational_degree/people_with_this_degree./education/education/major_field_of_study
|
||||
/location/hud_county_place/county
|
||||
/location/administrative_division/country
|
||||
/film/film/film_production_design_by
|
||||
/award/award_winning_work/awards_won./award/award_honor/award
|
||||
/organization/organization/headquarters./location/mailing_address/state_province_region
|
||||
/base/schemastaging/organization_extra/phone_number./base/schemastaging/phone_sandbox/contact_category
|
||||
/tv/tv_program/country_of_origin
|
||||
/olympics/olympic_participating_country/medals_won./olympics/olympic_medal_honor/medal
|
||||
/location/country/second_level_divisions
|
||||
/award/award_ceremony/awards_presented./award/award_honor/honored_for
|
||||
/organization/organization_member/member_of./organization/organization_membership/organization
|
||||
/education/educational_institution/campuses
|
||||
/music/artist/contribution./music/recording_contribution/performance_role
|
||||
/award/ranked_item/appears_in_ranked_lists./award/ranking/list
|
||||
/people/person/religion
|
||||
/travel/travel_destination/climate./travel/travel_destination_monthly_climate/month
|
||||
/film/special_film_performance_type/film_performance_type./film/performance/film
|
||||
/award/award_nominee/award_nominations./award/award_nomination/award
|
||||
/location/statistical_region/religions./location/religion_percentage/religion
|
||||
/sports/sports_league_draft/picks./sports/sports_league_draft_pick/school
|
||||
/film/film/distributors./film/film_film_distributor_relationship/region
|
||||
/government/politician/government_positions_held./government/government_position_held/legislative_sessions
|
||||
/organization/role/leaders./organization/leadership/organization
|
||||
/tv/tv_network/programs./tv/tv_network_duration/program
|
||||
/soccer/football_team/current_roster./sports/sports_team_roster/position
|
||||
/music/instrument/instrumentalists
|
||||
/business/business_operation/operating_income./measurement_unit/dated_money_value/currency
|
||||
/people/cause_of_death/people
|
||||
/film/film/film_art_direction_by
|
||||
/people/person/sibling_s./people/sibling_relationship/sibling
|
||||
/film/film/cinematography
|
||||
/film/actor/dubbing_performances./film/dubbing_performance/language
|
||||
/base/biblioness/bibs_location/state
|
||||
/base/petbreeds/city_with_dogs/top_breeds./base/petbreeds/dog_city_relationship/dog_breed
|
||||
/people/person/gender
|
||||
/education/field_of_study/students_majoring./education/education/student
|
||||
/base/popstra/celebrity/dated./base/popstra/dated/participant
|
||||
/sports/sports_team/roster./american_football/football_roster_position/position
|
||||
/award/award_winner/awards_won./award/award_honor/award_winner
|
||||
/olympics/olympic_participating_country/medals_won./olympics/olympic_medal_honor/olympics
|
||||
/film/director/film
|
||||
/tv/tv_producer/programs_produced./tv/tv_producer_term/program
|
||||
/film/film_distributor/films_distributed./film/film_film_distributor_relationship/film
|
||||
/olympics/olympic_games/sports
|
||||
/music/record_label/artist
|
||||
/education/university/local_tuition./measurement_unit/dated_money_value/currency
|
||||
/film/film/story_by
|
||||
/people/person/spouse_s./people/marriage/spouse
|
||||
/sports/sports_league/teams./sports/sports_league_participation/team
|
||||
/people/profession/specialization_of
|
||||
/base/americancomedy/celebrity_impressionist/celebrities_impersonated
|
||||
/tv/tv_program/genre
|
||||
/award/award_category/nominees./award/award_nomination/nominated_for
|
||||
/language/human_language/countries_spoken_in
|
||||
/organization/organization/headquarters./location/mailing_address/country
|
||||
/location/statistical_region/gdp_real./measurement_unit/adjusted_money_value/adjustment_currency
|
||||
/education/university/fraternities_and_sororities
|
||||
/award/award_nominee/award_nominations./award/award_nomination/award_nominee
|
||||
/military/military_combatant/military_conflicts./military/military_combatant_group/combatants
|
||||
/award/award_nominated_work/award_nominations./award/award_nomination/nominated_for
|
||||
/location/location/time_zones
|
||||
/film/film/dubbing_performances./film/dubbing_performance/actor
|
||||
/film/film_subject/films
|
||||
/education/educational_degree/people_with_this_degree./education/education/institution
|
||||
/education/educational_institution/colors
|
||||
/award/award_category/category_of
|
||||
/tv/tv_personality/tv_regular_appearances./tv/tv_regular_personal_appearance/program
|
||||
/film/film/language
|
||||
/music/group_member/membership./music/group_membership/group
|
||||
/business/business_operation/revenue./measurement_unit/dated_money_value/currency
|
||||
/film/film/film_festivals
|
||||
/film/actor/film./film/performance/special_performance_type
|
||||
/organization/non_profit_organization/registered_with./organization/non_profit_registration/registering_agency
|
||||
/government/politician/government_positions_held./government/government_position_held/jurisdiction_of_office
|
||||
/base/aareas/schema/administrative_area/administrative_parent
|
||||
/award/award_winning_work/awards_won./award/award_honor/award_winner
|
||||
/organization/organization/place_founded
|
||||
/soccer/football_player/current_team./sports/sports_team_roster/team
|
||||
/government/politician/government_positions_held./government/government_position_held/basic_title
|
||||
/music/artist/track_contributions./music/track_contribution/role
|
||||
/base/localfood/seasonal_month/produce_available./base/localfood/produce_availability/seasonal_months
|
||||
/celebrities/celebrity/celebrity_friends./celebrities/friendship/friend
|
||||
/sports/professional_sports_team/draft_picks./sports/sports_league_draft_pick/school
|
||||
/award/hall_of_fame/inductees./award/hall_of_fame_induction/inductee
|
||||
/influence/influence_node/peers./influence/peer_relationship/peers
|
||||
/medicine/disease/risk_factors
|
||||
/broadcast/content/artist
|
||||
/film/film/estimated_budget./measurement_unit/dated_money_value/currency
|
||||
/military/military_conflict/combatants./military/military_combatant_group/combatants
|
||||
/location/capital_of_administrative_division/capital_of./location/administrative_division_capital_relationship/administrative_division
|
||||
/tv/tv_program/regular_cast./tv/regular_tv_appearance/actor
|
||||
/people/deceased_person/place_of_burial
|
||||
/location/location/adjoin_s./location/adjoining_relationship/adjoins
|
||||
/music/group_member/membership./music/group_membership/role
|
||||
/award/award_ceremony/awards_presented./award/award_honor/award_winner
|
||||
/film/film/prequel
|
||||
/film/film/produced_by
|
||||
/tv/tv_program/tv_producer./tv/tv_producer_term/producer_type
|
||||
/sports/sports_position/players./sports/sports_team_roster/team
|
||||
/olympics/olympic_games/participating_countries
|
||||
/music/genre/parent_genre
|
||||
/tv/tv_writer/tv_programs./tv/tv_program_writer_relationship/tv_program
|
||||
/music/genre/artists
|
||||
/film/film/genre
|
||||
/people/person/employment_history./business/employment_tenure/company
|
||||
/education/university/domestic_tuition./measurement_unit/dated_money_value/currency
|
||||
/people/person/nationality
|
||||
/location/country/capital
|
||||
/location/statistical_region/gni_per_capita_in_ppp_dollars./measurement_unit/dated_money_value/currency
|
||||
/base/aareas/schema/administrative_area/capital
|
||||
/business/business_operation/industry
|
||||
/location/hud_foreclosure_area/estimated_number_of_mortgages./measurement_unit/dated_integer/source
|
||||
/film/film/other_crew./film/film_crew_gig/crewmember
|
||||
/base/popstra/location/vacationers./base/popstra/vacation_choice/vacationer
|
||||
/film/film/film_format
|
||||
/medicine/disease/notable_people_with_this_condition
|
||||
/film/film/costume_design_by
|
||||
/government/government_office_category/officeholders./government/government_position_held/jurisdiction_of_office
|
||||
/location/statistical_region/gdp_nominal./measurement_unit/dated_money_value/currency
|
||||
/sports/sports_team/roster./baseball/baseball_roster_position/position
|
||||
/award/award_winning_work/awards_won./award/award_honor/honored_for
|
||||
/olympics/olympic_sport/athletes./olympics/olympic_athlete_affiliation/olympics
|
||||
/celebrities/celebrity/sexual_relationships./celebrities/romantic_relationship/celebrity
|
||||
/people/marriage_union_type/unions_of_this_type./people/marriage/location_of_ceremony
|
||||
/organization/organization/child./organization/organization_relationship/child
|
||||
/organization/organization_founder/organizations_founded
|
||||
/sports/sports_team/sport
|
||||
/people/ethnicity/geographic_distribution
|
||||
/location/statistical_region/places_exported_to./location/imports_and_exports/exported_to
|
||||
/location/country/official_language
|
||||
/film/film/production_companies
|
||||
/user/jg/default_domain/olympic_games/sports
|
||||
/time/event/locations
|
||||
/people/person/spouse_s./people/marriage/type_of_union
|
||||
/government/governmental_body/members./government/government_position_held/legislative_sessions
|
||||
/media_common/netflix_genre/titles
|
||||
/user/alexander/philosophy/philosopher/interests
|
||||
/film/film/runtime./film/film_cut/film_release_region
|
||||
/education/educational_institution/students_graduates./education/education/student
|
||||
/base/eating/practicer_of_diet/diet
|
||||
/tv/non_character_role/tv_regular_personal_appearances./tv/tv_regular_personal_appearance/person
|
||||
/sports/sports_position/players./sports/sports_team_roster/position
|
||||
/sports/professional_sports_team/draft_picks./sports/sports_league_draft_pick/draft
|
||||
/medicine/symptom/symptom_of
|
||||
/film/person_or_entity_appearing_in_film/films./film/personal_film_appearance/type_of_appearance
|
||||
/sports/sports_team_location/teams
|
||||
/american_football/football_team/current_roster./sports/sports_team_roster/position
|
||||
/people/person/places_lived./people/place_lived/location
|
||||
/location/statistical_region/rent50_2./measurement_unit/dated_money_value/currency
|
||||
/film/film/personal_appearances./film/personal_film_appearance/person
|
||||
/music/instrument/family
|
||||
/sports/sports_team/roster./basketball/basketball_roster_position/position
|
||||
/base/schemastaging/organization_extra/phone_number./base/schemastaging/phone_sandbox/service_location
|
||||
/film/film/release_date_s./film/film_regional_release_date/film_release_region
|
||||
/award/award_category/disciplines_or_subjects
|
||||
/base/popstra/celebrity/friendship./base/popstra/friendship/participant
|
||||
/music/performance_role/regular_performances./music/group_membership/group
|
||||
/film/film/edited_by
|
||||
/base/x2010fifaworldcupsouthafrica/world_cup_squad/current_world_cup_squad./base/x2010fifaworldcupsouthafrica/current_world_cup_squad/current_club
|
||||
/base/popstra/celebrity/canoodled./base/popstra/canoodled/participant
|
||||
/film/film/release_date_s./film/film_regional_release_date/film_release_distribution_medium
|
||||
/film/film/other_crew./film/film_crew_gig/film_crew_role
|
||||
/base/popstra/celebrity/breakup./base/popstra/breakup/participant
|
||||
/film/film/country
|
||||
/music/performance_role/regular_performances./music/group_membership/role
|
||||
/sports/sports_team/roster./american_football/football_historical_roster_position/position_s
|
||||
/film/film/release_date_s./film/film_regional_release_date/film_regional_debut_venue
|
||||
/time/event/instance_of_recurring_event
|
||||
/olympics/olympic_participating_country/athletes./olympics/olympic_athlete_affiliation/olympics
|
||||
/organization/endowed_organization/endowment./measurement_unit/dated_money_value/currency
|
||||
/travel/travel_destination/how_to_get_here./travel/transportation/mode_of_transportation
|
||||
/baseball/baseball_team/team_stats./baseball/baseball_team_stats/season
|
||||
/award/award_category/winners./award/award_honor/ceremony
|
||||
/government/legislative_session/members./government/government_position_held/district_represented
|
||||
/influence/influence_node/influenced_by
|
||||
/base/culturalevent/event/entity_involved
|
||||
/people/ethnicity/people
|
||||
/sports/sport/pro_athletes./sports/pro_sports_played/athlete
|
||||
/location/statistical_region/gdp_nominal_per_capita./measurement_unit/dated_money_value/currency
|
||||
/location/hud_county_place/place
|
||||
/base/aareas/schema/administrative_area/administrative_area_type
|
||||
/base/locations/continents/countries_within
|
||||
/sports/sports_position/players./american_football/football_historical_roster_position/position_s
|
||||
/people/person/spouse_s./people/marriage/location_of_ceremony
|
||||
/education/educational_institution/students_graduates./education/education/major_field_of_study
|
||||
/film/film/written_by
|
||||
/olympics/olympic_sport/athletes./olympics/olympic_athlete_affiliation/country
|
||||
/music/performance_role/guest_performances./music/recording_contribution/performance_role
|
||||
/film/film/featured_film_locations
|
||||
/education/educational_institution_campus/educational_institution
|
||||
/sports/pro_athlete/teams./sports/sports_team_roster/team
|
||||
/people/ethnicity/languages_spoken
|
||||
/film/film/executive_produced_by
|
||||
/tv/tv_producer/programs_produced./tv/tv_producer_term/producer_type
|
||||
/location/location/contains
|
||||
/base/biblioness/bibs_location/country
|
||||
/user/ktrueman/default_domain/international_organization/member_states
|
||||
/music/performance_role/track_performances./music/track_contribution/role
|
||||
/olympics/olympic_games/medals_awarded./olympics/olympic_medal_honor/medal
|
||||
/base/saturdaynightlive/snl_cast_member/seasons./base/saturdaynightlive/snl_season_tenure/cast_members
|
20466
dataset/FB15k-237/test.tsv
Normal file
File diff suppressed because it is too large
272115
dataset/FB15k-237/train.tsv
Normal file
File diff suppressed because it is too large
3034
dataset/WN18RR/dev.tsv
Normal file
File diff suppressed because it is too large
40943
dataset/WN18RR/entities.txt
Normal file
File diff suppressed because it is too large
40943
dataset/WN18RR/entity2text.txt
Normal file
File diff suppressed because it is too large
40943
dataset/WN18RR/entity2text_clean.txt
Normal file
File diff suppressed because it is too large
155
dataset/WN18RR/get_neighbor.ipynb
Normal file
@ -0,0 +1,155 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"path1 = './entities.txt'\n",
|
||||
"path2 = './relations.txt'\n",
|
||||
"path3 = './train.tsv'\n",
|
||||
"path4 = './dev.tsv'\n",
|
||||
"path5 = './test.tsv'\n",
|
||||
"path6 = './get_neighbor/entity2id.txt'\n",
|
||||
"path7 = './get_neighbor/relation2id.txt'\n",
|
||||
"path8 = './get_neighbor/train2id.txt'\n",
|
||||
"path9 = './get_neighbor/valid2id.txt'\n",
|
||||
"path10 = './get_neighbor/test2id.txt'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(path1, 'r') as f:\n",
|
||||
" a = f.readlines()\n",
|
||||
"cnt = 0\n",
|
||||
"with open(path6, 'w') as f:\n",
|
||||
" for line in a:\n",
|
||||
" en = line.strip()\n",
|
||||
" f.write(en + '\\t' + str(cnt) + '\\n')\n",
|
||||
" cnt += 1\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(path2, 'r') as f:\n",
|
||||
" a = f.readlines()\n",
|
||||
"cnt = 0\n",
|
||||
"with open(path7, 'w') as f:\n",
|
||||
" for line in a:\n",
|
||||
" re = line.strip()\n",
|
||||
" f.write(re + '\\t' + str(cnt) + '\\n')\n",
|
||||
" cnt += 1\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(path6, 'r') as f:\n",
|
||||
" a = f.readlines()\n",
|
||||
"en2id = {}\n",
|
||||
"for line in a:\n",
|
||||
" b = line.strip().split('\\t')\n",
|
||||
" en, num = b[0], b[1]\n",
|
||||
" en2id[en] = num"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(path7, 'r') as f:\n",
|
||||
" a = f.readlines()\n",
|
||||
"re2id = {}\n",
|
||||
"for line in a:\n",
|
||||
" b = line.strip().split('\\t')\n",
|
||||
" re, num = b[0], b[1]\n",
|
||||
" re2id[re] = num"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(path3, 'r') as f:\n",
|
||||
" a = f.readlines()\n",
|
||||
"with open(path8, 'w') as f:\n",
|
||||
" for line in a:\n",
|
||||
" b = line.strip().split('\\t')\n",
|
||||
" h, r, t = b[0], b[1], b[2]\n",
|
||||
" f.write(en2id[h] + ' ' + re2id[r] + ' ' + en2id[t] + '\\n')\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(path4, 'r') as f:\n",
|
||||
" a = f.readlines()\n",
|
||||
"with open(path9, 'w') as f:\n",
|
||||
" for line in a:\n",
|
||||
" b = line.strip().split('\\t')\n",
|
||||
" h, r, t = b[0], b[1], b[2]\n",
|
||||
" f.write(en2id[h] + ' ' + re2id[r] + ' ' + en2id[t] + '\\n')\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(path5, 'r') as f:\n",
|
||||
" a = f.readlines()\n",
|
||||
"with open(path10, 'w') as f:\n",
|
||||
" for line in a:\n",
|
||||
" b = line.strip().split('\\t')\n",
|
||||
" h, r, t = b[0], b[1], b[2]\n",
|
||||
" f.write(en2id[h] + ' ' + re2id[r] + ' ' + en2id[t] + '\\n')\n",
|
||||
" "
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python [default]",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
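The notebook above assigns consecutive integer ids in file order (entities, then relations, then rewrites the train/dev/test triples as id triples). A quick consistency check, sketched below under the assumption that it is run from dataset/WN18RR/ with the same relative paths the notebook uses, is to compare line counts of the source files and the generated id files:

# Sanity-check sketch (assumed paths, identical to those in the notebook above).
def count_lines(path):
    with open(path) as f:
        return sum(1 for _ in f)

# 40943 entities and 11 relations for WN18RR, per the file listings below.
assert count_lines('./entities.txt') == count_lines('./get_neighbor/entity2id.txt')
assert count_lines('./relations.txt') == count_lines('./get_neighbor/relation2id.txt')
assert count_lines('./train.tsv') == count_lines('./get_neighbor/train2id.txt')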
40943
dataset/WN18RR/get_neighbor/entity2id.txt
Normal file
File diff suppressed because it is too large
11
dataset/WN18RR/get_neighbor/relation2id.txt
Normal file
@ -0,0 +1,11 @@
|
||||
_member_of_domain_usage 0
|
||||
_has_part 1
|
||||
_also_see 2
|
||||
_hypernym 3
|
||||
_synset_domain_topic_of 4
|
||||
_derivationally_related_form 5
|
||||
_similar_to 6
|
||||
_instance_hypernym 7
|
||||
_verb_group 8
|
||||
_member_meronym 9
|
||||
_member_of_domain_region 10
|
3134
dataset/WN18RR/get_neighbor/test2id.txt
Normal file
File diff suppressed because it is too large
86835
dataset/WN18RR/get_neighbor/train2id.txt
Normal file
File diff suppressed because it is too large
3034
dataset/WN18RR/get_neighbor/valid2id.txt
Normal file
File diff suppressed because it is too large
11
dataset/WN18RR/relation2text.txt
Normal file
@ -0,0 +1,11 @@
|
||||
_member_of_domain_usage member of domain usage
|
||||
_has_part has part
|
||||
_also_see also see
|
||||
_hypernym hypernym
|
||||
_synset_domain_topic_of synset domain topic of
|
||||
_derivationally_related_form derivationally related form
|
||||
_similar_to similar to
|
||||
_instance_hypernym instance hypernym
|
||||
_verb_group verb group
|
||||
_member_meronym member meronym
|
||||
_member_of_domain_region member of domain region
|
11
dataset/WN18RR/relations.txt
Normal file
@ -0,0 +1,11 @@
|
||||
_member_of_domain_usage
|
||||
_has_part
|
||||
_also_see
|
||||
_hypernym
|
||||
_synset_domain_topic_of
|
||||
_derivationally_related_form
|
||||
_similar_to
|
||||
_instance_hypernym
|
||||
_verb_group
|
||||
_member_meronym
|
||||
_member_of_domain_region
|
3134
dataset/WN18RR/test.tsv
Normal file
File diff suppressed because it is too large
86835
dataset/WN18RR/train.tsv
Normal file
File diff suppressed because it is too large
151
dataset/create_neighbor.py
Executable file
@ -0,0 +1,151 @@
|
||||
from collections import defaultdict
import argparse
import copy
import json
import random
import time

# Build masked one-hop neighbor files from the get_neighbor/*2id.txt files.
id2entity_name = defaultdict(str)

parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type=str, default=None)
args = parser.parse_args()

# dataset_name = 'FB15k-237'

# id -> entity name
with open('./' + args.dataset + '/get_neighbor/entity2id.txt', 'r') as file:
    entity_lines = file.readlines()
    for line in entity_lines:
        _name, _id = line.strip().split("\t")
        id2entity_name[int(_id)] = _name

# id -> relation name
id2relation_name = defaultdict(str)

with open('./' + args.dataset + '/get_neighbor/relation2id.txt', 'r') as file:
    relation_lines = file.readlines()
    for line in relation_lines:
        _name, _id = line.strip().split("\t")
        id2relation_name[int(_id)] = _name

# All triples (train + test + valid) as integer ids.
train_triplet = []

for line in open('./' + args.dataset + '/get_neighbor/train2id.txt', 'r'):
    head, relation, tail = line.strip('\n').split()
    train_triplet.append(list((int(head), int(relation), int(tail))))

for line in open('./' + args.dataset + '/get_neighbor/test2id.txt', 'r'):
    head, relation, tail = line.strip('\n').split()
    train_triplet.append(list((int(head), int(relation), int(tail))))

for line in open('./' + args.dataset + '/get_neighbor/valid2id.txt', 'r'):
    head, relation, tail = line.strip('\n').split()
    train_triplet.append(list((int(head), int(relation), int(tail))))

# graph[head][tail] = relation, reverse_graph[tail][head] = relation
graph = {}
reverse_graph = {}


def init_graph(graph_triplet):
    for triple in graph_triplet:
        head = triple[0]
        rela = triple[1]
        tail = triple[2]

        if head not in graph:
            graph[head] = {}
        graph[head][tail] = rela

        if tail not in reverse_graph:
            reverse_graph[tail] = {}
        reverse_graph[tail][head] = rela

    # return graph, reverse_graph, node_indegree, node_outdegree


init_graph(train_triplet)


def random_delete(triplet, reserved_num):
    reserved = random.sample(triplet, reserved_num)
    return reserved


def get_onestep_neighbors(graph, source, sample_num):
    # Return every (source, relation, neighbor) triple reachable in one hop;
    # sample_num is only used by the commented-out sampling variant below.
    triplet = []
    try:
        nei = list(graph[source].keys())
        # nei = random.sample(graph[source].keys(), sample_num)
        triplet = [tuple((source, graph[source][nei[i]], nei[i])) for i in range(len(nei))]
    except KeyError:
        pass
    except ValueError:
        nei = list(graph[source].keys())
        triplet = [tuple((source, graph[source][nei[i]], nei[i])) for i in range(len(nei))]
    return triplet


def get_entity_neighbors(target_entity, max_triplet):
    as_head_neighbors = get_onestep_neighbors(graph, target_entity, max_triplet // 2)
    as_tail_neighbors = get_onestep_neighbors(reverse_graph, target_entity, max_triplet // 2)
    all_triplet = as_head_neighbors + as_tail_neighbors
    return all_triplet


def get_triplet(triplet):
    # Collect the one-hop neighborhood of both endpoints, excluding the triple itself.
    head_entity = triplet[0]
    tail_entity = triplet[2]
    triplet = tuple((triplet[0], triplet[1], triplet[2]))

    head_triplet = get_entity_neighbors(head_entity, 4)
    tail_triplet = get_entity_neighbors(tail_entity, 4)

    temp_triplet = list(set(head_triplet + tail_triplet))
    temp_triplet = list(set(temp_triplet) - set([triplet]))
    # if len(temp_triplet) > 8:
    #     del_triplet = list(set(temp_triplet) - set([triplet]))
    #     temp_triplet = random_delete(del_triplet, 7)

    return temp_triplet


def change_(triplet_list):
    # Map id triples back to their textual names.
    tri_text = []
    for item in triplet_list:
        # text = id2entity_name[item[0]] + '\t' + id2relation_name[item[1]] + '\t' + id2entity_name[item[2]]
        h = id2entity_name[item[0]]
        r = id2relation_name[item[1]]
        t = id2entity_name[item[2]]
        tri_text.append([h, r, t])
    return tri_text


# For every triple, mask one endpoint and record the neighborhood of the remaining (entity, relation) pair.
mask_idx = 99999999
masked_tail_neighbor = defaultdict(list)
masked_head_neighbor = defaultdict(list)
for triplet in train_triplet:
    tail_masked = copy.deepcopy(triplet)
    head_masked = copy.deepcopy(triplet)
    tail_masked[2] = mask_idx
    head_masked[0] = mask_idx
    masked_tail_neighbor['\t'.join([id2entity_name[triplet[0]], id2relation_name[triplet[1]]])] = change_(get_triplet(tail_masked))
    masked_head_neighbor['\t'.join([id2entity_name[triplet[2]], id2relation_name[triplet[1]]])] = change_(get_triplet(head_masked))


with open("./" + args.dataset + "/masked_tail_neighbor.txt", "w") as file:
    file.write(json.dumps(masked_tail_neighbor, indent=1))

with open("./" + args.dataset + "/masked_head_neighbor.txt", "w") as file:
    file.write(json.dumps(masked_head_neighbor, indent=1))
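Presumably create_neighbor.py is run from the dataset/ directory, e.g. python create_neighbor.py --dataset FB15k-237 (or --dataset WN18RR). It writes masked_tail_neighbor.txt and masked_head_neighbor.txt as JSON keyed by "entity<TAB>relation" strings. A minimal sketch of reading the output, with the FB15k-237 path used only as an example:

# Sketch: load the neighbor context written by create_neighbor.py (assumes --dataset FB15k-237 was used).
import json

with open('./FB15k-237/masked_tail_neighbor.txt', 'r') as f:
    masked_tail_neighbor = json.load(f)

# Keys look like "<head entity>\t<relation>"; values are lists of [head, relation, tail] name triples.
first_key = next(iter(masked_tail_neighbor))
print(first_key, masked_tail_neighbor[first_key][:3])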
652
dataset/umls/dev.tsv
Normal file
@ -0,0 +1,652 @@
|
||||
nucleic_acid_nucleoside_or_nucleotide affects mental_or_behavioral_dysfunction
|
||||
patient_or_disabled_group performs individual_behavior
|
||||
neoplastic_process process_of molecular_function
|
||||
lipid affects biologic_function
|
||||
neoplastic_process affects alga
|
||||
antibiotic affects cell_or_molecular_dysfunction
|
||||
eicosanoid affects mental_or_behavioral_dysfunction
|
||||
fully_formed_anatomical_structure location_of injury_or_poisoning
|
||||
machine_activity method_of laboratory_procedure
|
||||
cell_or_molecular_dysfunction isa pathologic_function
|
||||
molecular_biology_research_technique measures organism_function
|
||||
organism_function affects animal
|
||||
patient_or_disabled_group performs governmental_or_regulatory_activity
|
||||
laboratory_procedure measures mental_process
|
||||
tissue surrounds body_space_or_junction
|
||||
anatomical_abnormality affects organism_function
|
||||
plant location_of biologically_active_substance
|
||||
pathologic_function degree_of mental_or_behavioral_dysfunction
|
||||
rickettsia_or_chlamydia location_of neuroreactive_substance_or_biogenic_amine
|
||||
steroid causes anatomical_abnormality
|
||||
organophosphorus_compound isa organic_chemical
|
||||
amino_acid_peptide_or_protein interacts_with eicosanoid
|
||||
age_group produces research_device
|
||||
acquired_abnormality result_of genetic_function
|
||||
organic_chemical interacts_with chemical
|
||||
invertebrate interacts_with fish
|
||||
gene_or_genome produces nucleic_acid_nucleoside_or_nucleotide
|
||||
enzyme isa biologically_active_substance
|
||||
cell location_of body_space_or_junction
|
||||
clinical_attribute degree_of organism_attribute
|
||||
vitamin isa biologically_active_substance
|
||||
animal interacts_with mammal
|
||||
injury_or_poisoning result_of experimental_model_of_disease
|
||||
organism_function co-occurs_with physiologic_function
|
||||
amino_acid_peptide_or_protein interacts_with biologically_active_substance
|
||||
pharmacologic_substance disrupts cell_function
|
||||
mental_process process_of bird
|
||||
acquired_abnormality result_of social_behavior
|
||||
research_activity measures temporal_concept
|
||||
steroid isa substance
|
||||
molecular_function process_of cell_function
|
||||
professional_or_occupational_group performs health_care_activity
|
||||
disease_or_syndrome result_of physiologic_function
|
||||
body_location_or_region location_of injury_or_poisoning
|
||||
antibiotic complicates acquired_abnormality
|
||||
organophosphorus_compound interacts_with amino_acid_peptide_or_protein
|
||||
neuroreactive_substance_or_biogenic_amine complicates injury_or_poisoning
|
||||
gene_or_genome produces body_substance
|
||||
injury_or_poisoning associated_with clinical_attribute
|
||||
cell_function affects human
|
||||
finding associated_with injury_or_poisoning
|
||||
laboratory_procedure measures organic_chemical
|
||||
disease_or_syndrome result_of mental_or_behavioral_dysfunction
|
||||
physiologic_function precedes mental_process
|
||||
body_space_or_junction issue_in occupation_or_discipline
|
||||
mental_or_behavioral_dysfunction process_of animal
|
||||
acquired_abnormality result_of physiologic_function
|
||||
acquired_abnormality result_of injury_or_poisoning
|
||||
idea_or_concept isa conceptual_entity
|
||||
molecular_function process_of archaeon
|
||||
anatomical_abnormality result_of organism_function
|
||||
health_care_related_organization location_of molecular_biology_research_technique
|
||||
eicosanoid causes neoplastic_process
|
||||
pathologic_function precedes neoplastic_process
|
||||
environmental_effect_of_humans result_of injury_or_poisoning
|
||||
element_ion_or_isotope causes neoplastic_process
|
||||
molecular_function affects vertebrate
|
||||
neuroreactive_substance_or_biogenic_amine causes acquired_abnormality
|
||||
steroid causes congenital_abnormality
|
||||
inorganic_chemical interacts_with chemical
|
||||
tissue produces nucleic_acid_nucleoside_or_nucleotide
|
||||
body_part_organ_or_organ_component location_of cell_function
|
||||
organism_attribute property_of animal
|
||||
eicosanoid interacts_with indicator_reagent_or_diagnostic_aid
|
||||
disease_or_syndrome affects mental_process
|
||||
cell_or_molecular_dysfunction process_of disease_or_syndrome
|
||||
pathologic_function result_of biologic_function
|
||||
finding manifestation_of mental_or_behavioral_dysfunction
|
||||
congenital_abnormality location_of bacterium
|
||||
biomedical_or_dental_material causes neoplastic_process
|
||||
chemical_viewed_functionally interacts_with biomedical_or_dental_material
|
||||
experimental_model_of_disease process_of disease_or_syndrome
|
||||
pathologic_function affects experimental_model_of_disease
|
||||
receptor complicates pathologic_function
|
||||
chemical_viewed_structurally affects experimental_model_of_disease
|
||||
fish exhibits individual_behavior
|
||||
immunologic_factor isa entity
|
||||
diagnostic_procedure measures molecular_function
|
||||
carbohydrate isa entity
|
||||
pathologic_function process_of plant
|
||||
amino_acid_sequence property_of gene_or_genome
|
||||
cell_or_molecular_dysfunction affects biologic_function
|
||||
food isa entity
|
||||
neoplastic_process process_of human
|
||||
hazardous_or_poisonous_substance complicates anatomical_abnormality
|
||||
body_location_or_region location_of disease_or_syndrome
|
||||
cell_function process_of animal
|
||||
natural_phenomenon_or_process affects organ_or_tissue_function
|
||||
neuroreactive_substance_or_biogenic_amine isa chemical_viewed_functionally
|
||||
organophosphorus_compound interacts_with element_ion_or_isotope
|
||||
genetic_function result_of disease_or_syndrome
|
||||
neoplastic_process process_of invertebrate
|
||||
laboratory_procedure assesses_effect_of experimental_model_of_disease
|
||||
alga isa organism
|
||||
clinical_attribute measurement_of organ_or_tissue_function
|
||||
human isa entity
|
||||
molecular_sequence isa idea_or_concept
|
||||
hazardous_or_poisonous_substance affects molecular_function
|
||||
amino_acid_peptide_or_protein isa chemical_viewed_structurally
|
||||
age_group issue_in biomedical_occupation_or_discipline
|
||||
laboratory_or_test_result measurement_of element_ion_or_isotope
|
||||
organization location_of laboratory_procedure
|
||||
steroid isa organic_chemical
|
||||
therapeutic_or_preventive_procedure affects disease_or_syndrome
|
||||
natural_phenomenon_or_process result_of organ_or_tissue_function
|
||||
chemical_viewed_functionally causes anatomical_abnormality
|
||||
geographic_area associated_with injury_or_poisoning
|
||||
carbohydrate_sequence result_of mental_process
|
||||
genetic_function result_of environmental_effect_of_humans
|
||||
biomedical_or_dental_material affects cell_or_molecular_dysfunction
|
||||
chemical_viewed_functionally affects pathologic_function
|
||||
molecular_function affects archaeon
|
||||
neoplastic_process manifestation_of organ_or_tissue_function
|
||||
tissue produces neuroreactive_substance_or_biogenic_amine
|
||||
indicator_reagent_or_diagnostic_aid causes cell_or_molecular_dysfunction
|
||||
laboratory_or_test_result evaluation_of mental_process
|
||||
biomedical_or_dental_material causes cell_or_molecular_dysfunction
|
||||
neoplastic_process result_of organ_or_tissue_function
|
||||
genetic_function produces neuroreactive_substance_or_biogenic_amine
|
||||
mental_or_behavioral_dysfunction result_of organ_or_tissue_function
|
||||
mental_process affects invertebrate
|
||||
indicator_reagent_or_diagnostic_aid affects natural_phenomenon_or_process
|
||||
mental_or_behavioral_dysfunction associated_with pathologic_function
|
||||
mental_process affects neoplastic_process
|
||||
cell_function affects biologic_function
|
||||
experimental_model_of_disease manifestation_of genetic_function
|
||||
inorganic_chemical causes congenital_abnormality
|
||||
laboratory_or_test_result measurement_of organic_chemical
|
||||
physical_object isa entity
|
||||
body_location_or_region location_of pathologic_function
|
||||
neuroreactive_substance_or_biogenic_amine complicates cell_function
|
||||
research_activity affects mental_process
|
||||
laboratory_procedure measures pathologic_function
|
||||
amino_acid_peptide_or_protein causes cell_or_molecular_dysfunction
|
||||
acquired_abnormality affects human
|
||||
diagnostic_procedure affects pathologic_function
|
||||
immunologic_factor complicates cell_or_molecular_dysfunction
|
||||
bacterium issue_in biomedical_occupation_or_discipline
|
||||
receptor complicates genetic_function
|
||||
neoplastic_process complicates experimental_model_of_disease
|
||||
organ_or_tissue_function affects cell_function
|
||||
therapeutic_or_preventive_procedure isa health_care_activity
|
||||
experimental_model_of_disease result_of social_behavior
|
||||
therapeutic_or_preventive_procedure method_of biomedical_occupation_or_discipline
|
||||
eicosanoid affects mental_process
|
||||
drug_delivery_device causes congenital_abnormality
|
||||
organism_function affects rickettsia_or_chlamydia
|
||||
mental_or_behavioral_dysfunction produces enzyme
|
||||
manufactured_object causes neoplastic_process
|
||||
chemical_viewed_structurally interacts_with immunologic_factor
|
||||
cell_function process_of fungus
|
||||
physiologic_function process_of invertebrate
|
||||
natural_phenomenon_or_process result_of congenital_abnormality
|
||||
vitamin complicates congenital_abnormality
|
||||
gene_or_genome part_of body_part_organ_or_organ_component
|
||||
disease_or_syndrome result_of phenomenon_or_process
|
||||
disease_or_syndrome affects animal
|
||||
patient_or_disabled_group performs occupational_activity
|
||||
organism_attribute result_of organism_function
|
||||
biologically_active_substance affects pathologic_function
|
||||
embryonic_structure location_of experimental_model_of_disease
|
||||
vitamin affects physiologic_function
|
||||
medical_device causes cell_or_molecular_dysfunction
|
||||
research_activity measures antibiotic
|
||||
drug_delivery_device treats acquired_abnormality
|
||||
organism isa physical_object
|
||||
molecular_function process_of fungus
|
||||
physiologic_function result_of organ_or_tissue_function
|
||||
antibiotic prevents disease_or_syndrome
|
||||
medical_device causes mental_or_behavioral_dysfunction
|
||||
nucleic_acid_nucleoside_or_nucleotide affects mental_process
|
||||
cell_or_molecular_dysfunction process_of physiologic_function
|
||||
chemical affects mental_or_behavioral_dysfunction
|
||||
nucleic_acid_nucleoside_or_nucleotide interacts_with neuroreactive_substance_or_biogenic_amine
|
||||
plant interacts_with bacterium
|
||||
organic_chemical interacts_with chemical_viewed_functionally
|
||||
experimental_model_of_disease associated_with clinical_attribute
|
||||
congenital_abnormality part_of organism
|
||||
gene_or_genome location_of experimental_model_of_disease
|
||||
body_part_organ_or_organ_component location_of fungus
|
||||
amino_acid_peptide_or_protein affects pathologic_function
|
||||
genetic_function produces hormone
|
||||
laboratory_procedure associated_with anatomical_abnormality
|
||||
antibiotic causes pathologic_function
|
||||
acquired_abnormality affects physiologic_function
|
||||
professional_or_occupational_group isa group
|
||||
sign_or_symptom associated_with acquired_abnormality
|
||||
enzyme causes congenital_abnormality
|
||||
genetic_function process_of cell_function
|
||||
vitamin complicates physiologic_function
|
||||
clinical_attribute measurement_of molecular_function
|
||||
embryonic_structure location_of mental_or_behavioral_dysfunction
|
||||
injury_or_poisoning result_of phenomenon_or_process
|
||||
chemical_viewed_structurally affects natural_phenomenon_or_process
|
||||
cell_function affects mental_or_behavioral_dysfunction
|
||||
mental_process affects social_behavior
|
||||
biologic_function process_of virus
|
||||
diagnostic_procedure analyzes indicator_reagent_or_diagnostic_aid
|
||||
experimental_model_of_disease affects physiologic_function
|
||||
virus location_of receptor
|
||||
qualitative_concept evaluation_of health_care_activity
|
||||
cell_function affects alga
|
||||
mental_or_behavioral_dysfunction process_of biologic_function
|
||||
mental_process process_of organ_or_tissue_function
|
||||
organ_or_tissue_function result_of injury_or_poisoning
|
||||
neoplastic_process precedes cell_or_molecular_dysfunction
|
||||
disease_or_syndrome degree_of mental_or_behavioral_dysfunction
|
||||
patient_or_disabled_group produces medical_device
|
||||
antibiotic interacts_with chemical
|
||||
disease_or_syndrome manifestation_of neoplastic_process
|
||||
cell_function process_of organism_function
|
||||
organism_attribute manifestation_of cell_function
|
||||
alga issue_in biomedical_occupation_or_discipline
|
||||
professional_society issue_in biomedical_occupation_or_discipline
|
||||
phenomenon_or_process result_of organism_function
|
||||
chemical affects organism_function
|
||||
laboratory_or_test_result manifestation_of organism_function
|
||||
congenital_abnormality affects organism_function
|
||||
daily_or_recreational_activity associated_with injury_or_poisoning
|
||||
laboratory_or_test_result measurement_of enzyme
|
||||
congenital_abnormality part_of bird
|
||||
neoplastic_process manifestation_of mental_process
|
||||
laboratory_procedure analyzes chemical_viewed_structurally
|
||||
disease_or_syndrome result_of biologic_function
|
||||
hormone disrupts cell
|
||||
cell_or_molecular_dysfunction manifestation_of molecular_function
|
||||
age_group produces regulation_or_law
|
||||
mental_process affects bird
|
||||
medical_device treats mental_or_behavioral_dysfunction
|
||||
phenomenon_or_process result_of mental_process
|
||||
embryonic_structure part_of virus
|
||||
molecular_function affects reptile
|
||||
therapeutic_or_preventive_procedure prevents experimental_model_of_disease
|
||||
lipid isa substance
|
||||
laboratory_procedure assesses_effect_of physiologic_function
|
||||
fish interacts_with organism
|
||||
plant isa physical_object
|
||||
gene_or_genome isa entity
|
||||
clinical_attribute property_of invertebrate
|
||||
diagnostic_procedure analyzes element_ion_or_isotope
|
||||
antibiotic affects natural_phenomenon_or_process
|
||||
gene_or_genome produces vitamin
|
||||
neoplastic_process affects natural_phenomenon_or_process
|
||||
neoplastic_process result_of health_care_activity
|
||||
diagnostic_procedure measures receptor
|
||||
bacterium interacts_with archaeon
|
||||
physiologic_function affects organism_attribute
|
||||
hormone interacts_with receptor
|
||||
professional_society carries_out laboratory_procedure
|
||||
cell location_of organ_or_tissue_function
|
||||
amino_acid_peptide_or_protein ingredient_of clinical_drug
|
||||
human_caused_phenomenon_or_process result_of natural_phenomenon_or_process
|
||||
research_activity issue_in occupation_or_discipline
|
||||
chemical_viewed_functionally causes acquired_abnormality
|
||||
reptile isa vertebrate
|
||||
biologic_function affects invertebrate
|
||||
neoplastic_process affects organism
|
||||
vitamin affects natural_phenomenon_or_process
|
||||
antibiotic diagnoses disease_or_syndrome
|
||||
acquired_abnormality manifestation_of physiologic_function
|
||||
pharmacologic_substance isa chemical
|
||||
age_group exhibits social_behavior
|
||||
organism_function process_of animal
|
||||
professional_or_occupational_group performs machine_activity
|
||||
experimental_model_of_disease isa event
|
||||
neoplastic_process process_of disease_or_syndrome
|
||||
acquired_abnormality location_of disease_or_syndrome
|
||||
event issue_in biomedical_occupation_or_discipline
|
||||
mental_or_behavioral_dysfunction occurs_in professional_or_occupational_group
|
||||
indicator_reagent_or_diagnostic_aid affects experimental_model_of_disease
|
||||
mental_or_behavioral_dysfunction isa biologic_function
|
||||
health_care_activity method_of occupation_or_discipline
|
||||
element_ion_or_isotope affects experimental_model_of_disease
|
||||
plant interacts_with fungus
|
||||
patient_or_disabled_group issue_in occupation_or_discipline
|
||||
self_help_or_relief_organization carries_out occupational_activity
|
||||
research_activity measures molecular_function
|
||||
acquired_abnormality part_of amphibian
|
||||
receptor affects mental_process
|
||||
nucleic_acid_nucleoside_or_nucleotide causes injury_or_poisoning
|
||||
cell_or_molecular_dysfunction affects organ_or_tissue_function
|
||||
organism_attribute result_of experimental_model_of_disease
|
||||
pathologic_function affects bacterium
|
||||
professional_society location_of health_care_activity
|
||||
hazardous_or_poisonous_substance disrupts embryonic_structure
|
||||
animal exhibits social_behavior
|
||||
biologic_function result_of congenital_abnormality
|
||||
pathologic_function affects mental_process
|
||||
diagnostic_procedure measures amino_acid_peptide_or_protein
|
||||
molecular_function co-occurs_with physiologic_function
|
||||
family_group uses medical_device
|
||||
group performs machine_activity
|
||||
laboratory_procedure associated_with pathologic_function
|
||||
neoplastic_process co-occurs_with congenital_abnormality
|
||||
laboratory_procedure measures indicator_reagent_or_diagnostic_aid
|
||||
anatomical_abnormality result_of pathologic_function
|
||||
body_location_or_region location_of cell_function
|
||||
research_activity measures steroid
|
||||
invertebrate causes neoplastic_process
|
||||
laboratory_procedure analyzes hormone
|
||||
disease_or_syndrome affects biologic_function
|
||||
pathologic_function affects genetic_function
|
||||
tissue issue_in occupation_or_discipline
|
||||
biologic_function affects plant
|
||||
anatomical_abnormality affects reptile
|
||||
body_location_or_region location_of mental_or_behavioral_dysfunction
|
||||
medical_device treats pathologic_function
|
||||
organism_attribute result_of cell_function
|
||||
gene_or_genome location_of virus
|
||||
gene_or_genome part_of tissue
|
||||
tissue produces hormone
|
||||
laboratory_or_test_result indicates neoplastic_process
|
||||
mental_or_behavioral_dysfunction complicates injury_or_poisoning
|
||||
biologically_active_substance causes experimental_model_of_disease
|
||||
therapeutic_or_preventive_procedure issue_in biomedical_occupation_or_discipline
|
||||
quantitative_concept measurement_of body_location_or_region
|
||||
professional_or_occupational_group isa entity
|
||||
gene_or_genome affects organ_or_tissue_function
|
||||
eicosanoid affects disease_or_syndrome
|
||||
immunologic_factor complicates organism_function
|
||||
gene_or_genome part_of reptile
|
||||
laboratory_or_test_result manifestation_of molecular_function
|
||||
mental_or_behavioral_dysfunction occurs_in family_group
|
||||
therapeutic_or_preventive_procedure treats mental_or_behavioral_dysfunction
|
||||
population_group isa group
|
||||
body_location_or_region location_of tissue
|
||||
quantitative_concept measurement_of molecular_sequence
|
||||
laboratory_procedure isa activity
|
||||
diagnostic_procedure assesses_effect_of organophosphorus_compound
|
||||
gene_or_genome issue_in occupation_or_discipline
|
||||
organ_or_tissue_function process_of reptile
|
||||
geographic_area isa conceptual_entity
|
||||
neuroreactive_substance_or_biogenic_amine affects mental_or_behavioral_dysfunction
|
||||
biologically_active_substance isa chemical
|
||||
enzyme disrupts embryonic_structure
|
||||
virus location_of vitamin
|
||||
professional_or_occupational_group uses regulation_or_law
|
||||
experimental_model_of_disease result_of therapeutic_or_preventive_procedure
|
||||
indicator_reagent_or_diagnostic_aid causes neoplastic_process
|
||||
sign_or_symptom evaluation_of biologic_function
|
||||
physiologic_function process_of amphibian
|
||||
classification issue_in biomedical_occupation_or_discipline
|
||||
organism_function produces biologically_active_substance
|
||||
laboratory_or_test_result measurement_of chemical
|
||||
immunologic_factor disrupts body_part_organ_or_organ_component
|
||||
health_care_activity issue_in biomedical_occupation_or_discipline
|
||||
carbohydrate interacts_with antibiotic
|
||||
neoplastic_process result_of diagnostic_procedure
|
||||
mental_or_behavioral_dysfunction result_of organism_function
|
||||
cell_component location_of organ_or_tissue_function
|
||||
organophosphorus_compound issue_in occupation_or_discipline
|
||||
cell_component location_of experimental_model_of_disease
|
||||
lipid causes acquired_abnormality
|
||||
experimental_model_of_disease result_of mental_process
|
||||
anatomical_abnormality result_of cell_or_molecular_dysfunction
|
||||
cell_function isa physiologic_function
|
||||
acquired_abnormality manifestation_of cell_function
|
||||
laboratory_or_test_result associated_with disease_or_syndrome
|
||||
mental_process produces hormone
|
||||
mammal exhibits behavior
|
||||
daily_or_recreational_activity associated_with neoplastic_process
|
||||
clinical_drug causes injury_or_poisoning
|
||||
research_activity associated_with pathologic_function
|
||||
cell_or_molecular_dysfunction process_of human
|
||||
body_part_organ_or_organ_component part_of invertebrate
|
||||
drug_delivery_device treats sign_or_symptom
|
||||
neuroreactive_substance_or_biogenic_amine affects disease_or_syndrome
|
||||
vertebrate isa physical_object
|
||||
experimental_model_of_disease result_of diagnostic_procedure
|
||||
drug_delivery_device isa entity
|
||||
therapeutic_or_preventive_procedure uses clinical_drug
|
||||
enzyme affects cell_or_molecular_dysfunction
|
||||
diagnostic_procedure analyzes neuroreactive_substance_or_biogenic_amine
|
||||
amphibian exhibits individual_behavior
|
||||
mental_or_behavioral_dysfunction process_of physiologic_function
|
||||
laboratory_procedure diagnoses cell_or_molecular_dysfunction
|
||||
therapeutic_or_preventive_procedure complicates mental_process
|
||||
steroid interacts_with inorganic_chemical
|
||||
physiologic_function affects plant
|
||||
biomedical_occupation_or_discipline isa conceptual_entity
|
||||
laboratory_procedure analyzes carbohydrate
|
||||
eicosanoid interacts_with receptor
|
||||
age_group performs molecular_biology_research_technique
|
||||
element_ion_or_isotope interacts_with enzyme
|
||||
hazardous_or_poisonous_substance disrupts cell_component
|
||||
congenital_abnormality result_of physiologic_function
|
||||
organophosphorus_compound interacts_with neuroreactive_substance_or_biogenic_amine
|
||||
anatomical_abnormality part_of bacterium
|
||||
clinical_drug causes anatomical_abnormality
|
||||
body_space_or_junction issue_in biomedical_occupation_or_discipline
|
||||
therapeutic_or_preventive_procedure affects mental_process
|
||||
health_care_activity associated_with injury_or_poisoning
|
||||
molecular_function precedes organ_or_tissue_function
|
||||
health_care_related_organization carries_out research_activity
|
||||
cell_function process_of molecular_function
|
||||
neoplastic_process affects experimental_model_of_disease
|
||||
diagnostic_procedure affects cell_or_molecular_dysfunction
|
||||
diagnostic_procedure issue_in occupation_or_discipline
|
||||
governmental_or_regulatory_activity method_of biomedical_occupation_or_discipline
|
||||
laboratory_or_test_result manifestation_of cell_function
|
||||
professional_or_occupational_group produces regulation_or_law
|
||||
laboratory_or_test_result measurement_of pharmacologic_substance
|
||||
pharmacologic_substance affects experimental_model_of_disease
|
||||
receptor affects cell_function
|
||||
neuroreactive_substance_or_biogenic_amine causes anatomical_abnormality
|
||||
body_part_organ_or_organ_component produces vitamin
|
||||
hormone affects biologic_function
|
||||
fully_formed_anatomical_structure location_of disease_or_syndrome
|
||||
receptor affects physiologic_function
|
||||
research_activity measures organism_attribute
|
||||
finding manifestation_of organ_or_tissue_function
|
||||
mental_or_behavioral_dysfunction manifestation_of physiologic_function
|
||||
health_care_activity affects mental_or_behavioral_dysfunction
|
||||
antibiotic interacts_with immunologic_factor
|
||||
disease_or_syndrome produces body_substance
|
||||
diagnostic_procedure measures biomedical_or_dental_material
|
||||
chemical affects natural_phenomenon_or_process
|
||||
research_activity measures biomedical_or_dental_material
|
||||
body_part_organ_or_organ_component conceptual_part_of body_system
|
||||
disease_or_syndrome affects bacterium
|
||||
chemical causes anatomical_abnormality
|
||||
organism_function result_of mental_process
|
||||
cell_or_molecular_dysfunction occurs_in age_group
|
||||
pathologic_function affects amphibian
|
||||
molecular_function isa phenomenon_or_process
|
||||
laboratory_procedure analyzes vitamin
|
||||
governmental_or_regulatory_activity associated_with pathologic_function
|
||||
mental_process result_of acquired_abnormality
|
||||
tissue produces organophosphorus_compound
|
||||
gene_or_genome part_of cell_component
|
||||
mental_or_behavioral_dysfunction affects animal
|
||||
immunologic_factor causes acquired_abnormality
|
||||
antibiotic treats acquired_abnormality
|
||||
eicosanoid isa lipid
|
||||
neuroreactive_substance_or_biogenic_amine causes pathologic_function
|
||||
antibiotic treats congenital_abnormality
|
||||
acquired_abnormality part_of plant
|
||||
mental_or_behavioral_dysfunction process_of mental_process
|
||||
professional_or_occupational_group exhibits individual_behavior
|
||||
cell_component location_of biologic_function
|
||||
hazardous_or_poisonous_substance isa chemical_viewed_functionally
|
||||
cell_function result_of molecular_function
|
||||
element_ion_or_isotope ingredient_of clinical_drug
|
||||
acquired_abnormality affects amphibian
|
||||
group uses classification
|
||||
organic_chemical interacts_with eicosanoid
|
||||
receptor isa biologically_active_substance
|
||||
biologically_active_substance affects molecular_function
|
||||
pathologic_function precedes mental_or_behavioral_dysfunction
|
||||
laboratory_procedure assesses_effect_of biologically_active_substance
|
||||
cell_function produces hormone
|
||||
biologically_active_substance disrupts embryonic_structure
|
||||
biologic_function produces receptor
|
||||
alga location_of hormone
|
||||
experimental_model_of_disease produces receptor
|
||||
organ_or_tissue_function occurs_in mental_process
|
||||
nucleic_acid_nucleoside_or_nucleotide affects molecular_function
|
||||
acquired_abnormality part_of rickettsia_or_chlamydia
|
||||
medical_device treats experimental_model_of_disease
|
||||
neoplastic_process process_of experimental_model_of_disease
|
||||
geographic_area associated_with cell_or_molecular_dysfunction
|
||||
organophosphorus_compound interacts_with steroid
|
||||
cell_function isa natural_phenomenon_or_process
|
||||
disease_or_syndrome result_of social_behavior
|
||||
mental_or_behavioral_dysfunction occurs_in patient_or_disabled_group
|
||||
injury_or_poisoning occurs_in professional_or_occupational_group
|
||||
hazardous_or_poisonous_substance complicates congenital_abnormality
|
||||
invertebrate causes pathologic_function
|
||||
acquired_abnormality occurs_in professional_or_occupational_group
|
||||
lipid affects mental_or_behavioral_dysfunction
|
||||
clinical_attribute associated_with organism_attribute
|
||||
lipid affects mental_process
|
||||
invertebrate interacts_with reptile
|
||||
gene_or_genome part_of vertebrate
|
||||
organ_or_tissue_function process_of mammal
|
||||
body_substance conceptual_part_of body_system
|
||||
body_part_organ_or_organ_component produces neuroreactive_substance_or_biogenic_amine
|
||||
carbohydrate interacts_with inorganic_chemical
|
||||
anatomical_abnormality part_of mammal
|
||||
natural_phenomenon_or_process affects molecular_function
|
||||
substance causes cell_or_molecular_dysfunction
|
||||
embryonic_structure surrounds cell
|
||||
injury_or_poisoning isa phenomenon_or_process
|
||||
diagnostic_procedure diagnoses anatomical_abnormality
|
||||
body_space_or_junction location_of injury_or_poisoning
|
||||
cell_function result_of experimental_model_of_disease
|
||||
neuroreactive_substance_or_biogenic_amine complicates genetic_function
|
||||
experimental_model_of_disease result_of environmental_effect_of_humans
|
||||
health_care_activity affects cell_or_molecular_dysfunction
|
||||
professional_society carries_out diagnostic_procedure
|
||||
health_care_activity affects mental_process
|
||||
group produces research_device
|
||||
cell_component location_of congenital_abnormality
|
||||
vertebrate isa animal
|
||||
molecular_biology_research_technique measures biomedical_or_dental_material
|
||||
professional_society produces classification
|
||||
amino_acid_sequence isa idea_or_concept
|
||||
genetic_function co-occurs_with physiologic_function
|
||||
mental_or_behavioral_dysfunction manifestation_of genetic_function
|
||||
biologic_function process_of mammal
|
||||
individual_behavior affects social_behavior
|
||||
pathologic_function co-occurs_with injury_or_poisoning
|
||||
invertebrate causes experimental_model_of_disease
|
||||
fish interacts_with archaeon
|
||||
research_device causes disease_or_syndrome
|
||||
quantitative_concept issue_in biomedical_occupation_or_discipline
|
||||
professional_society location_of therapeutic_or_preventive_procedure
|
||||
drug_delivery_device prevents disease_or_syndrome
|
||||
fully_formed_anatomical_structure part_of invertebrate
|
||||
mammal isa entity
|
||||
body_part_organ_or_organ_component produces receptor
|
||||
molecular_function affects mammal
|
||||
laboratory_procedure analyzes biomedical_or_dental_material
|
||||
human_caused_phenomenon_or_process isa phenomenon_or_process
|
||||
experimental_model_of_disease process_of vertebrate
|
||||
professional_society carries_out research_activity
|
||||
experimental_model_of_disease precedes cell_or_molecular_dysfunction
|
||||
experimental_model_of_disease affects amphibian
|
||||
laboratory_procedure assesses_effect_of hazardous_or_poisonous_substance
|
||||
anatomical_abnormality issue_in biomedical_occupation_or_discipline
|
||||
hormone affects mental_process
|
||||
laboratory_procedure analyzes pharmacologic_substance
|
||||
body_location_or_region location_of genetic_function
|
||||
disease_or_syndrome result_of injury_or_poisoning
|
||||
laboratory_procedure assesses_effect_of neoplastic_process
|
||||
congenital_abnormality affects animal
|
||||
biomedical_or_dental_material interacts_with immunologic_factor
|
||||
organism_function isa natural_phenomenon_or_process
|
||||
classification isa intellectual_product
|
||||
natural_phenomenon_or_process result_of anatomical_abnormality
|
||||
chemical_viewed_functionally affects neoplastic_process
|
||||
amino_acid_sequence result_of mental_process
|
||||
clinical_attribute property_of reptile
|
||||
mammal exhibits individual_behavior
|
||||
natural_phenomenon_or_process affects disease_or_syndrome
|
||||
organ_or_tissue_function process_of neoplastic_process
|
||||
biologically_active_substance complicates mental_process
|
||||
laboratory_procedure assesses_effect_of biomedical_or_dental_material
|
||||
biomedical_or_dental_material interacts_with chemical
|
||||
neoplastic_process associated_with cell_or_molecular_dysfunction
|
||||
qualitative_concept isa idea_or_concept
|
||||
sign_or_symptom evaluation_of experimental_model_of_disease
|
||||
neuroreactive_substance_or_biogenic_amine interacts_with receptor
|
||||
cell location_of pathologic_function
|
||||
diagnostic_procedure assesses_effect_of enzyme
|
||||
acquired_abnormality part_of alga
|
||||
organophosphorus_compound interacts_with hazardous_or_poisonous_substance
|
||||
diagnostic_procedure assesses_effect_of lipid
|
||||
fungus interacts_with invertebrate
|
||||
laboratory_or_test_result measurement_of physiologic_function
|
||||
acquired_abnormality affects mental_process
|
||||
disease_or_syndrome affects reptile
|
||||
amino_acid_sequence isa entity
|
||||
mental_process result_of biologic_function
|
||||
organic_chemical affects biologic_function
|
||||
steroid interacts_with hormone
|
||||
pathologic_function result_of acquired_abnormality
|
||||
research_activity measures chemical_viewed_structurally
|
||||
therapeutic_or_preventive_procedure associated_with mental_or_behavioral_dysfunction
|
||||
physiologic_function result_of mental_process
|
||||
clinical_attribute result_of human_caused_phenomenon_or_process
|
||||
laboratory_procedure measures antibiotic
|
||||
cell part_of invertebrate
|
||||
vitamin complicates cell_or_molecular_dysfunction
|
||||
clinical_attribute manifestation_of molecular_function
|
||||
organism_function result_of acquired_abnormality
|
||||
professional_or_occupational_group interacts_with age_group
|
||||
natural_phenomenon_or_process affects neoplastic_process
|
||||
organization carries_out research_activity
|
||||
embryonic_structure part_of bacterium
|
||||
fully_formed_anatomical_structure produces enzyme
|
||||
organic_chemical interacts_with indicator_reagent_or_diagnostic_aid
|
||||
natural_phenomenon_or_process result_of human_caused_phenomenon_or_process
|
||||
neoplastic_process affects pathologic_function
|
||||
fully_formed_anatomical_structure issue_in biomedical_occupation_or_discipline
|
||||
environmental_effect_of_humans result_of experimental_model_of_disease
|
||||
experimental_model_of_disease manifestation_of physiologic_function
|
||||
body_part_organ_or_organ_component location_of mental_process
|
||||
receptor causes injury_or_poisoning
|
||||
sign_or_symptom diagnoses disease_or_syndrome
|
||||
antibiotic disrupts mental_process
|
||||
mental_process precedes organism_function
|
||||
chemical_viewed_structurally affects cell_or_molecular_dysfunction
|
||||
vitamin disrupts molecular_function
|
||||
pharmacologic_substance causes injury_or_poisoning
|
||||
professional_or_occupational_group performs governmental_or_regulatory_activity
|
||||
educational_activity isa activity
|
||||
congenital_abnormality location_of disease_or_syndrome
|
||||
neoplastic_process co-occurs_with pathologic_function
|
||||
chemical_viewed_functionally causes mental_or_behavioral_dysfunction
|
||||
biologic_function process_of human
|
||||
hormone complicates mental_or_behavioral_dysfunction
|
||||
embryonic_structure location_of rickettsia_or_chlamydia
|
||||
congenital_abnormality result_of mental_or_behavioral_dysfunction
|
||||
organ_or_tissue_function produces enzyme
|
||||
molecular_biology_research_technique measures experimental_model_of_disease
|
||||
disease_or_syndrome process_of organism_function
|
||||
finding manifestation_of disease_or_syndrome
|
||||
pathologic_function process_of mammal
|
||||
organ_or_tissue_function process_of human
|
||||
indicator_reagent_or_diagnostic_aid affects physiologic_function
|
||||
health_care_related_organization carries_out molecular_biology_research_technique
|
||||
hazardous_or_poisonous_substance disrupts organ_or_tissue_function
|
||||
mental_process process_of invertebrate
|
||||
tissue location_of experimental_model_of_disease
|
||||
antibiotic isa pharmacologic_substance
|
||||
therapeutic_or_preventive_procedure prevents mental_or_behavioral_dysfunction
|
||||
steroid affects disease_or_syndrome
|
||||
pharmacologic_substance prevents disease_or_syndrome
|
||||
behavior result_of mental_process
|
||||
social_behavior associated_with geographic_area
|
||||
tissue part_of body_part_organ_or_organ_component
|
||||
molecular_function affects rickettsia_or_chlamydia
|
||||
population_group performs governmental_or_regulatory_activity
|
||||
biologically_active_substance disrupts organism_function
|
||||
acquired_abnormality isa anatomical_abnormality
|
||||
molecular_function affects alga
|
||||
congenital_abnormality result_of human_caused_phenomenon_or_process
|
||||
congenital_abnormality result_of environmental_effect_of_humans
|
||||
neoplastic_process process_of mental_or_behavioral_dysfunction
|
||||
functional_concept isa entity
|
||||
spatial_concept isa conceptual_entity
|
||||
mental_or_behavioral_dysfunction process_of cell_or_molecular_dysfunction
|
||||
biomedical_or_dental_material causes anatomical_abnormality
|
||||
hazardous_or_poisonous_substance causes congenital_abnormality
|
||||
antibiotic disrupts cell
|
||||
disease_or_syndrome affects alga
|
||||
finding manifestation_of experimental_model_of_disease
|
||||
element_ion_or_isotope affects natural_phenomenon_or_process
|
||||
amphibian interacts_with archaeon
|
||||
body_space_or_junction location_of mental_process
|
||||
substance causes neoplastic_process
|
||||
biologic_function affects genetic_function
|
||||
indicator_reagent_or_diagnostic_aid causes injury_or_poisoning
|
||||
research_activity measures pharmacologic_substance
|
||||
injury_or_poisoning result_of environmental_effect_of_humans
|
||||
organization issue_in occupation_or_discipline
|
||||
organ_or_tissue_function process_of mental_process
|
||||
research_activity associated_with mental_or_behavioral_dysfunction
|
||||
human issue_in biomedical_occupation_or_discipline
|
||||
molecular_function affects disease_or_syndrome
|
||||
eicosanoid affects pathologic_function
|
|
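The listing above is one "head relation tail" triple per line. Below is a minimal loading sketch, not taken from the repository; the path and delimiter are assumptions (the notebook further below reads the .tsv splits with a tab delimiter, while this rendered listing shows single spaces).

# Minimal loading sketch (assumed path and delimiter):
def load_triples(path, sep=None):
    """sep=None splits on any whitespace; pass '\\t' for the repo's .tsv splits."""
    triples = []
    with open(path) as f:
        for line in f:
            parts = line.strip().split(sep)
            if len(parts) == 3:
                triples.append(tuple(parts))
    return triples

# Example (hypothetical path):
# triples = load_triples('./dataset/umls/test.tsv', sep='\t')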
135
dataset/umls/entities.txt
Normal file
@ -0,0 +1,135 @@
|
||||
idea_or_concept
|
||||
virus
|
||||
spatial_concept
|
||||
human_caused_phenomenon_or_process
|
||||
human
|
||||
organ_or_tissue_function
|
||||
daily_or_recreational_activity
|
||||
steroid
|
||||
biomedical_or_dental_material
|
||||
vertebrate
|
||||
immunologic_factor
|
||||
inorganic_chemical
|
||||
invertebrate
|
||||
embryonic_structure
|
||||
functional_concept
|
||||
amino_acid_peptide_or_protein
|
||||
fish
|
||||
reptile
|
||||
physical_object
|
||||
disease_or_syndrome
|
||||
biologically_active_substance
|
||||
physiologic_function
|
||||
population_group
|
||||
group
|
||||
body_space_or_junction
|
||||
bird
|
||||
qualitative_concept
|
||||
bacterium
|
||||
cell_function
|
||||
enzyme
|
||||
organophosphorus_compound
|
||||
nucleic_acid_nucleoside_or_nucleotide
|
||||
cell
|
||||
language
|
||||
antibiotic
|
||||
indicator_reagent_or_diagnostic_aid
|
||||
fungus
|
||||
chemical_viewed_functionally
|
||||
rickettsia_or_chlamydia
|
||||
patient_or_disabled_group
|
||||
professional_society
|
||||
health_care_related_organization
|
||||
clinical_attribute
|
||||
biomedical_occupation_or_discipline
|
||||
temporal_concept
|
||||
phenomenon_or_process
|
||||
family_group
|
||||
chemical_viewed_structurally
|
||||
regulation_or_law
|
||||
acquired_abnormality
|
||||
experimental_model_of_disease
|
||||
professional_or_occupational_group
|
||||
injury_or_poisoning
|
||||
receptor
|
||||
drug_delivery_device
|
||||
hazardous_or_poisonous_substance
|
||||
organism
|
||||
neoplastic_process
|
||||
mammal
|
||||
molecular_function
|
||||
lipid
|
||||
group_attribute
|
||||
nucleotide_sequence
|
||||
biologic_function
|
||||
chemical
|
||||
cell_component
|
||||
intellectual_product
|
||||
manufactured_object
|
||||
classification
|
||||
geographic_area
|
||||
vitamin
|
||||
gene_or_genome
|
||||
self_help_or_relief_organization
|
||||
pathologic_function
|
||||
amphibian
|
||||
laboratory_or_test_result
|
||||
organism_attribute
|
||||
cell_or_molecular_dysfunction
|
||||
therapeutic_or_preventive_procedure
|
||||
sign_or_symptom
|
||||
occupational_activity
|
||||
anatomical_abnormality
|
||||
hormone
|
||||
fully_formed_anatomical_structure
|
||||
educational_activity
|
||||
quantitative_concept
|
||||
tissue
|
||||
organism_function
|
||||
social_behavior
|
||||
mental_or_behavioral_dysfunction
|
||||
governmental_or_regulatory_activity
|
||||
molecular_biology_research_technique
|
||||
occupation_or_discipline
|
||||
conceptual_entity
|
||||
body_location_or_region
|
||||
pharmacologic_substance
|
||||
clinical_drug
|
||||
food
|
||||
substance
|
||||
genetic_function
|
||||
congenital_abnormality
|
||||
medical_device
|
||||
carbohydrate
|
||||
health_care_activity
|
||||
eicosanoid
|
||||
element_ion_or_isotope
|
||||
diagnostic_procedure
|
||||
entity
|
||||
event
|
||||
laboratory_procedure
|
||||
environmental_effect_of_humans
|
||||
body_part_organ_or_organ_component
|
||||
molecular_sequence
|
||||
mental_process
|
||||
research_device
|
||||
alga
|
||||
natural_phenomenon_or_process
|
||||
anatomical_structure
|
||||
animal
|
||||
body_system
|
||||
behavior
|
||||
carbohydrate_sequence
|
||||
archaeon
|
||||
research_activity
|
||||
organization
|
||||
individual_behavior
|
||||
organic_chemical
|
||||
finding
|
||||
age_group
|
||||
activity
|
||||
machine_activity
|
||||
plant
|
||||
body_substance
|
||||
amino_acid_sequence
|
||||
neuroreactive_substance_or_biogenic_amine
|
135
dataset/umls/entity2text.txt
Normal file

@ -0,0 +1,135 @@
|
||||
idea_or_concept idea or concept
|
||||
virus virus
|
||||
spatial_concept spatial concept
|
||||
human_caused_phenomenon_or_process human caused phenomenon or process
|
||||
human human
|
||||
organ_or_tissue_function organ or tissue function
|
||||
daily_or_recreational_activity daily or recreational activity
|
||||
steroid steroid
|
||||
biomedical_or_dental_material biomedical or dental material
|
||||
vertebrate vertebrate
|
||||
immunologic_factor immunologic factor
|
||||
inorganic_chemical inorganic chemical
|
||||
invertebrate invertebrate
|
||||
embryonic_structure embryonic structure
|
||||
functional_concept functional concept
|
||||
amino_acid_peptide_or_protein amino acid peptide or protein
|
||||
fish fish
|
||||
reptile reptile
|
||||
physical_object physical object
|
||||
disease_or_syndrome disease or syndrome
|
||||
biologically_active_substance biologically active substance
|
||||
physiologic_function physiologic function
|
||||
population_group population group
|
||||
group group
|
||||
body_space_or_junction body space or junction
|
||||
bird bird
|
||||
qualitative_concept qualitative concept
|
||||
bacterium bacterium
|
||||
cell_function cell function
|
||||
enzyme enzyme
|
||||
organophosphorus_compound organophosphorus compound
|
||||
nucleic_acid_nucleoside_or_nucleotide nucleic acid nucleoside or nucleotide
|
||||
cell cell
|
||||
language language
|
||||
antibiotic antibiotic
|
||||
indicator_reagent_or_diagnostic_aid indicator reagent or diagnostic aid
|
||||
fungus fungus
|
||||
chemical_viewed_functionally chemical viewed functionally
|
||||
rickettsia_or_chlamydia rickettsia or chlamydia
|
||||
patient_or_disabled_group patient or disabled group
|
||||
professional_society professional society
|
||||
health_care_related_organization health care related organization
|
||||
clinical_attribute clinical attribute
|
||||
biomedical_occupation_or_discipline biomedical occupation or discipline
|
||||
temporal_concept temporal concept
|
||||
phenomenon_or_process phenomenon or process
|
||||
family_group family group
|
||||
chemical_viewed_structurally chemical viewed structurally
|
||||
regulation_or_law regulation or law
|
||||
acquired_abnormality acquired abnormality
|
||||
experimental_model_of_disease experimental model of disease
|
||||
professional_or_occupational_group professional or occupational group
|
||||
injury_or_poisoning injury or poisoning
|
||||
receptor receptor
|
||||
drug_delivery_device drug delivery device
|
||||
hazardous_or_poisonous_substance hazardous or poisonous substance
|
||||
organism organism
|
||||
neoplastic_process neoplastic process
|
||||
mammal mammal
|
||||
molecular_function molecular function
|
||||
lipid lipid
|
||||
group_attribute group attribute
|
||||
nucleotide_sequence nucleotide sequence
|
||||
biologic_function biologic function
|
||||
chemical chemical
|
||||
cell_component cell component
|
||||
intellectual_product intellectual product
|
||||
manufactured_object manufactured object
|
||||
classification classification
|
||||
geographic_area geographic area
|
||||
vitamin vitamin
|
||||
gene_or_genome gene or genome
|
||||
self_help_or_relief_organization self help or relief organization
|
||||
pathologic_function pathologic function
|
||||
amphibian amphibian
|
||||
laboratory_or_test_result laboratory or test result
|
||||
organism_attribute organism attribute
|
||||
cell_or_molecular_dysfunction cell or molecular dysfunction
|
||||
therapeutic_or_preventive_procedure therapeutic or preventive procedure
|
||||
sign_or_symptom sign or symptom
|
||||
occupational_activity occupational activity
|
||||
anatomical_abnormality anatomical abnormality
|
||||
hormone hormone
|
||||
fully_formed_anatomical_structure fully formed anatomical structure
|
||||
educational_activity educational activity
|
||||
quantitative_concept quantitative concept
|
||||
tissue tissue
|
||||
organism_function organism function
|
||||
social_behavior social behavior
|
||||
mental_or_behavioral_dysfunction mental or behavioral dysfunction
|
||||
governmental_or_regulatory_activity governmental or regulatory activity
|
||||
molecular_biology_research_technique molecular biology research technique
|
||||
occupation_or_discipline occupation or discipline
|
||||
conceptual_entity conceptual entity
|
||||
body_location_or_region body location or region
|
||||
pharmacologic_substance pharmacologic substance
|
||||
clinical_drug clinical drug
|
||||
food food
|
||||
substance substance
|
||||
genetic_function genetic function
|
||||
congenital_abnormality congenital abnormality
|
||||
medical_device medical device
|
||||
carbohydrate carbohydrate
|
||||
health_care_activity health care activity
|
||||
eicosanoid eicosanoid
|
||||
element_ion_or_isotope element ion or isotope
|
||||
diagnostic_procedure diagnostic procedure
|
||||
entity entity
|
||||
event event
|
||||
laboratory_procedure laboratory procedure
|
||||
environmental_effect_of_humans environmental effect of humans
|
||||
body_part_organ_or_organ_component body part organ or organ component
|
||||
molecular_sequence molecular sequence
|
||||
mental_process mental process
|
||||
research_device research device
|
||||
alga alga
|
||||
natural_phenomenon_or_process natural phenomenon or process
|
||||
anatomical_structure anatomical structure
|
||||
animal animal
|
||||
body_system body system
|
||||
behavior behavior
|
||||
carbohydrate_sequence carbohydrate sequence
|
||||
archaeon archaeon
|
||||
research_activity research activity
|
||||
organization organization
|
||||
individual_behavior individual behavior
|
||||
organic_chemical organic chemical
|
||||
finding finding
|
||||
age_group age group
|
||||
activity activity
|
||||
machine_activity machine activity
|
||||
plant plant
|
||||
body_substance body substance
|
||||
amino_acid_sequence amino acid sequence
|
||||
neuroreactive_substance_or_biogenic_amine neuroreactive substance or biogenic amine
|
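From the rows above, the text form of each entity appears to be the entity name with underscores replaced by spaces. The sketch below regenerates such a file under that assumption; the tab delimiter is a guess, since the rendered diff does not show which separator the file uses.

# Sketch (assumption: text = name with underscores replaced by spaces, tab-separated):
with open('entities.txt') as src, open('entity2text.txt', 'w') as dst:
    for line in src:
        name = line.strip()
        if name:
            dst.write(name + '\t' + name.replace('_', ' ') + '\n')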
135
dataset/umls/entity2textlong.txt
Normal file
File diff suppressed because one or more lines are too long
155
dataset/umls/get_neighbor.ipynb
Normal file
@ -0,0 +1,155 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"path1 = './entities.txt'\n",
|
||||
"path2 = './relations.txt'\n",
|
||||
"path3 = './train.tsv'\n",
|
||||
"path4 = './dev.tsv'\n",
|
||||
"path5 = './test.tsv'\n",
|
||||
"path6 = './get_neighbor/entity2id.txt'\n",
|
||||
"path7 = './get_neighbor/relation2id.txt'\n",
|
||||
"path8 = './get_neighbor/train2id.txt'\n",
|
||||
"path9 = './get_neighbor/valid2id.txt'\n",
|
||||
"path10 = './get_neighbor/test2id.txt'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(path1, 'r') as f:\n",
|
||||
" a = f.readlines()\n",
|
||||
"cnt = 0\n",
|
||||
"with open(path6, 'w') as f:\n",
|
||||
" for line in a:\n",
|
||||
" en = line.strip()\n",
|
||||
" f.write(en + '\\t' + str(cnt) + '\\n')\n",
|
||||
" cnt += 1\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(path2, 'r') as f:\n",
|
||||
" a = f.readlines()\n",
|
||||
"cnt = 0\n",
|
||||
"with open(path7, 'w') as f:\n",
|
||||
" for line in a:\n",
|
||||
" re = line.strip()\n",
|
||||
" f.write(re + '\\t' + str(cnt) + '\\n')\n",
|
||||
" cnt += 1\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(path6, 'r') as f:\n",
|
||||
" a = f.readlines()\n",
|
||||
"en2id = {}\n",
|
||||
"for line in a:\n",
|
||||
" b = line.strip().split('\\t')\n",
|
||||
" en, num = b[0], b[1]\n",
|
||||
" en2id[en] = num"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(path7, 'r') as f:\n",
|
||||
" a = f.readlines()\n",
|
||||
"re2id = {}\n",
|
||||
"for line in a:\n",
|
||||
" b = line.strip().split('\\t')\n",
|
||||
" re, num = b[0], b[1]\n",
|
||||
" re2id[re] = num"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(path3, 'r') as f:\n",
|
||||
" a = f.readlines()\n",
|
||||
"with open(path8, 'w') as f:\n",
|
||||
" for line in a:\n",
|
||||
" b = line.strip().split('\\t')\n",
|
||||
" h, r, t = b[0], b[1], b[2]\n",
|
||||
" f.write(en2id[h] + ' ' + re2id[r] + ' ' + en2id[t] + '\\n')\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(path4, 'r') as f:\n",
|
||||
" a = f.readlines()\n",
|
||||
"with open(path9, 'w') as f:\n",
|
||||
" for line in a:\n",
|
||||
" b = line.strip().split('\\t')\n",
|
||||
" h, r, t = b[0], b[1], b[2]\n",
|
||||
" f.write(en2id[h] + ' ' + re2id[r] + ' ' + en2id[t] + '\\n')\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(path5, 'r') as f:\n",
|
||||
" a = f.readlines()\n",
|
||||
"with open(path10, 'w') as f:\n",
|
||||
" for line in a:\n",
|
||||
" b = line.strip().split('\\t')\n",
|
||||
" h, r, t = b[0], b[1], b[2]\n",
|
||||
" f.write(en2id[h] + ' ' + re2id[r] + ' ' + en2id[t] + '\\n')\n",
|
||||
" "
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python [default]",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
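For reference, the following is a compact standalone restatement of what the notebook cells above do: assign consecutive integer ids to the entities and relations, then rewrite the tab-separated train/dev/test triples as space-separated id triples under get_neighbor/. It is a sketch assuming the same relative file layout as the notebook (run from dataset/umls/); it is not an additional script in the repository.

# Standalone sketch of the notebook's conversion (same relative paths):
import os

def write_id_map(names_path, out_path):
    """Assign consecutive integer ids to the names in names_path (one per line)."""
    mapping = {}
    with open(names_path) as src, open(out_path, 'w') as dst:
        for idx, line in enumerate(src):
            name = line.strip()
            dst.write(name + '\t' + str(idx) + '\n')
            mapping[name] = str(idx)
    return mapping

def convert_split(split_path, out_path, en2id, re2id):
    """Rewrite tab-separated (head, relation, tail) triples as space-separated ids."""
    with open(split_path) as src, open(out_path, 'w') as dst:
        for line in src:
            h, r, t = line.strip().split('\t')
            dst.write(en2id[h] + ' ' + re2id[r] + ' ' + en2id[t] + '\n')

if __name__ == '__main__':
    os.makedirs('./get_neighbor', exist_ok=True)
    en2id = write_id_map('./entities.txt', './get_neighbor/entity2id.txt')
    re2id = write_id_map('./relations.txt', './get_neighbor/relation2id.txt')
    convert_split('./train.tsv', './get_neighbor/train2id.txt', en2id, re2id)
    convert_split('./dev.tsv', './get_neighbor/valid2id.txt', en2id, re2id)
    convert_split('./test.tsv', './get_neighbor/test2id.txt', en2id, re2id)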
135
dataset/umls/get_neighbor/entity2id.txt
Normal file
@ -0,0 +1,135 @@
|
||||
idea_or_concept 0
|
||||
virus 1
|
||||
spatial_concept 2
|
||||
human_caused_phenomenon_or_process 3
|
||||
human 4
|
||||
organ_or_tissue_function 5
|
||||
daily_or_recreational_activity 6
|
||||
steroid 7
|
||||
biomedical_or_dental_material 8
|
||||
vertebrate 9
|
||||
immunologic_factor 10
|
||||
inorganic_chemical 11
|
||||
invertebrate 12
|
||||
embryonic_structure 13
|
||||
functional_concept 14
|
||||
amino_acid_peptide_or_protein 15
|
||||
fish 16
|
||||
reptile 17
|
||||
physical_object 18
|
||||
disease_or_syndrome 19
|
||||
biologically_active_substance 20
|
||||
physiologic_function 21
|
||||
population_group 22
|
||||
group 23
|
||||
body_space_or_junction 24
|
||||
bird 25
|
||||
qualitative_concept 26
|
||||
bacterium 27
|
||||
cell_function 28
|
||||
enzyme 29
|
||||
organophosphorus_compound 30
|
||||
nucleic_acid_nucleoside_or_nucleotide 31
|
||||
cell 32
|
||||
language 33
|
||||
antibiotic 34
|
||||
indicator_reagent_or_diagnostic_aid 35
|
||||
fungus 36
|
||||
chemical_viewed_functionally 37
|
||||
rickettsia_or_chlamydia 38
|
||||
patient_or_disabled_group 39
|
||||
professional_society 40
|
||||
health_care_related_organization 41
|
||||
clinical_attribute 42
|
||||
biomedical_occupation_or_discipline 43
|
||||
temporal_concept 44
|
||||
phenomenon_or_process 45
|
||||
family_group 46
|
||||
chemical_viewed_structurally 47
|
||||
regulation_or_law 48
|
||||
acquired_abnormality 49
|
||||
experimental_model_of_disease 50
|
||||
professional_or_occupational_group 51
|
||||
injury_or_poisoning 52
|
||||
receptor 53
|
||||
drug_delivery_device 54
|
||||
hazardous_or_poisonous_substance 55
|
||||
organism 56
|
||||
neoplastic_process 57
|
||||
mammal 58
|
||||
molecular_function 59
|
||||
lipid 60
|
||||
group_attribute 61
|
||||
nucleotide_sequence 62
|
||||
biologic_function 63
|
||||
chemical 64
|
||||
cell_component 65
|
||||
intellectual_product 66
|
||||
manufactured_object 67
|
||||
classification 68
|
||||
geographic_area 69
|
||||
vitamin 70
|
||||
gene_or_genome 71
|
||||
self_help_or_relief_organization 72
|
||||
pathologic_function 73
|
||||
amphibian 74
|
||||
laboratory_or_test_result 75
|
||||
organism_attribute 76
|
||||
cell_or_molecular_dysfunction 77
|
||||
therapeutic_or_preventive_procedure 78
|
||||
sign_or_symptom 79
|
||||
occupational_activity 80
|
||||
anatomical_abnormality 81
|
||||
hormone 82
|
||||
fully_formed_anatomical_structure 83
|
||||
educational_activity 84
|
||||
quantitative_concept 85
|
||||
tissue 86
|
||||
organism_function 87
|
||||
social_behavior 88
|
||||
mental_or_behavioral_dysfunction 89
|
||||
governmental_or_regulatory_activity 90
|
||||
molecular_biology_research_technique 91
|
||||
occupation_or_discipline 92
|
||||
conceptual_entity 93
|
||||
body_location_or_region 94
|
||||
pharmacologic_substance 95
|
||||
clinical_drug 96
|
||||
food 97
|
||||
substance 98
|
||||
genetic_function 99
|
||||
congenital_abnormality 100
|
||||
medical_device 101
|
||||
carbohydrate 102
|
||||
health_care_activity 103
|
||||
eicosanoid 104
|
||||
element_ion_or_isotope 105
|
||||
diagnostic_procedure 106
|
||||
entity 107
|
||||
event 108
|
||||
laboratory_procedure 109
|
||||
environmental_effect_of_humans 110
|
||||
body_part_organ_or_organ_component 111
|
||||
molecular_sequence 112
|
||||
mental_process 113
|
||||
research_device 114
|
||||
alga 115
|
||||
natural_phenomenon_or_process 116
|
||||
anatomical_structure 117
|
||||
animal 118
|
||||
body_system 119
|
||||
behavior 120
|
||||
carbohydrate_sequence 121
|
||||
archaeon 122
|
||||
research_activity 123
|
||||
organization 124
|
||||
individual_behavior 125
|
||||
organic_chemical 126
|
||||
finding 127
|
||||
age_group 128
|
||||
activity 129
|
||||
machine_activity 130
|
||||
plant 131
|
||||
body_substance 132
|
||||
amino_acid_sequence 133
|
||||
neuroreactive_substance_or_biogenic_amine 134
|
46
dataset/umls/get_neighbor/relation2id.txt
Normal file
@ -0,0 +1,46 @@
|
||||
measures 0
|
||||
derivative_of 1
|
||||
disrupts 2
|
||||
prevents 3
|
||||
conceptually_related_to 4
|
||||
manifestation_of 5
|
||||
diagnoses 6
|
||||
evaluation_of 7
|
||||
contains 8
|
||||
co-occurs_with 9
|
||||
conceptual_part_of 10
|
||||
performs 11
|
||||
degree_of 12
|
||||
interacts_with 13
|
||||
uses 14
|
||||
issue_in 15
|
||||
assesses_effect_of 16
|
||||
property_of 17
|
||||
precedes 18
|
||||
result_of 19
|
||||
causes 20
|
||||
practices 21
|
||||
ingredient_of 22
|
||||
analyzes 23
|
||||
surrounds 24
|
||||
indicates 25
|
||||
associated_with 26
|
||||
affects 27
|
||||
location_of 28
|
||||
produces 29
|
||||
process_of 30
|
||||
measurement_of 31
|
||||
connected_to 32
|
||||
carries_out 33
|
||||
method_of 34
|
||||
adjacent_to 35
|
||||
occurs_in 36
|
||||
consists_of 37
|
||||
interconnects 38
|
||||
manages 39
|
||||
complicates 40
|
||||
part_of 41
|
||||
treats 42
|
||||
isa 43
|
||||
developmental_form_of 44
|
||||
exhibits 45
|
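The entity2id and relation2id maps above make the id-encoded splits readable again. The helper below is illustrative only (it is not part of the repository): it inverts the two maps so that integer triples such as those in test2id.txt below can be printed back as entity and relation names.

# Illustrative helper (not in the repo): decode id triples back to names.
def load_inverse_map(path):
    inv = {}
    with open(path) as f:
        for line in f:
            name, idx = line.strip().split('\t')
            inv[int(idx)] = name
    return inv

id2en = load_inverse_map('./get_neighbor/entity2id.txt')
id2re = load_inverse_map('./get_neighbor/relation2id.txt')
with open('./get_neighbor/test2id.txt') as f:
    for line in list(f)[:3]:  # first few triples only
        h, r, t = (int(x) for x in line.split())
        print(id2en[h], id2re[r], id2en[t])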
661
dataset/umls/get_neighbor/test2id.txt
Normal file
@ -0,0 +1,661 @@
|
||||
7 13 104
|
||||
42 43 93
|
||||
94 28 21
|
||||
57 43 19
|
||||
102 27 59
|
||||
19 27 5
|
||||
98 15 92
|
||||
75 7 99
|
||||
37 43 107
|
||||
30 22 96
|
||||
89 27 88
|
||||
3 19 73
|
||||
15 13 34
|
||||
82 27 50
|
||||
34 27 28
|
||||
13 41 25
|
||||
86 29 126
|
||||
99 30 5
|
||||
100 41 58
|
||||
11 20 77
|
||||
53 2 32
|
||||
40 28 109
|
||||
87 9 28
|
||||
10 20 77
|
||||
70 27 57
|
||||
34 40 81
|
||||
85 31 21
|
||||
73 27 63
|
||||
100 15 92
|
||||
86 35 24
|
||||
70 2 5
|
||||
53 43 98
|
||||
113 43 87
|
||||
9 45 125
|
||||
94 28 78
|
||||
18 15 43
|
||||
11 27 28
|
||||
57 27 9
|
||||
28 27 58
|
||||
22 14 101
|
||||
3 19 59
|
||||
95 42 57
|
||||
13 28 77
|
||||
50 19 109
|
||||
29 2 28
|
||||
73 40 52
|
||||
89 12 19
|
||||
131 13 118
|
||||
19 30 73
|
||||
73 19 106
|
||||
81 5 63
|
||||
75 5 73
|
||||
16 15 43
|
||||
102 27 77
|
||||
63 27 1
|
||||
113 18 28
|
||||
50 36 89
|
||||
100 41 27
|
||||
3 19 21
|
||||
32 28 78
|
||||
50 30 116
|
||||
57 19 45
|
||||
134 27 87
|
||||
89 30 122
|
||||
100 41 1
|
||||
73 5 99
|
||||
20 20 89
|
||||
95 42 19
|
||||
24 28 28
|
||||
99 27 1
|
||||
49 19 106
|
||||
21 27 122
|
||||
65 28 73
|
||||
91 0 37
|
||||
59 19 3
|
||||
51 14 67
|
||||
53 2 59
|
||||
5 30 63
|
||||
91 0 105
|
||||
21 19 50
|
||||
21 29 20
|
||||
83 41 74
|
||||
29 40 87
|
||||
86 28 1
|
||||
12 43 118
|
||||
113 9 99
|
||||
81 28 27
|
||||
83 28 77
|
||||
5 9 99
|
||||
21 19 3
|
||||
59 19 77
|
||||
106 26 73
|
||||
21 19 57
|
||||
60 13 95
|
||||
65 29 53
|
||||
21 19 100
|
||||
128 11 130
|
||||
100 26 76
|
||||
109 23 15
|
||||
70 27 113
|
||||
65 41 56
|
||||
68 43 93
|
||||
87 19 73
|
||||
87 29 70
|
||||
109 6 89
|
||||
102 27 73
|
||||
65 43 107
|
||||
73 27 56
|
||||
113 27 27
|
||||
75 25 5
|
||||
86 29 132
|
||||
65 41 111
|
||||
59 27 89
|
||||
102 13 10
|
||||
6 26 49
|
||||
59 19 19
|
||||
57 36 52
|
||||
83 41 122
|
||||
102 27 113
|
||||
105 13 55
|
||||
52 2 59
|
||||
51 13 22
|
||||
89 27 122
|
||||
126 27 57
|
||||
98 20 19
|
||||
20 20 19
|
||||
52 2 86
|
||||
73 15 92
|
||||
89 30 17
|
||||
49 5 73
|
||||
105 15 43
|
||||
120 26 128
|
||||
19 40 77
|
||||
83 29 53
|
||||
64 20 52
|
||||
89 27 17
|
||||
63 27 87
|
||||
34 13 134
|
||||
20 27 89
|
||||
109 16 99
|
||||
123 0 15
|
||||
19 27 77
|
||||
73 19 19
|
||||
19 36 89
|
||||
88 26 39
|
||||
34 6 89
|
||||
73 19 87
|
||||
131 13 17
|
||||
89 27 74
|
||||
79 6 89
|
||||
63 19 89
|
||||
63 27 77
|
||||
81 41 9
|
||||
46 13 23
|
||||
50 30 16
|
||||
89 27 116
|
||||
87 27 115
|
||||
65 28 24
|
||||
111 28 99
|
||||
64 27 99
|
||||
37 27 21
|
||||
33 15 43
|
||||
111 28 5
|
||||
123 43 80
|
||||
106 23 55
|
||||
50 36 23
|
||||
31 43 107
|
||||
106 26 100
|
||||
80 26 77
|
||||
5 19 49
|
||||
59 27 28
|
||||
47 22 96
|
||||
89 19 120
|
||||
47 13 102
|
||||
28 27 57
|
||||
73 36 89
|
||||
46 11 78
|
||||
46 29 114
|
||||
15 27 113
|
||||
82 40 63
|
||||
81 19 103
|
||||
87 30 63
|
||||
10 40 19
|
||||
62 17 31
|
||||
124 28 80
|
||||
42 19 99
|
||||
70 20 81
|
||||
89 27 5
|
||||
57 19 89
|
||||
111 29 126
|
||||
1 43 56
|
||||
5 30 21
|
||||
125 26 92
|
||||
55 27 19
|
||||
77 27 21
|
||||
82 2 86
|
||||
59 27 118
|
||||
21 27 59
|
||||
63 27 21
|
||||
75 25 113
|
||||
47 27 5
|
||||
28 19 19
|
||||
15 27 63
|
||||
50 27 63
|
||||
8 27 63
|
||||
91 0 15
|
||||
50 27 27
|
||||
34 27 5
|
||||
4 43 58
|
||||
22 11 6
|
||||
65 10 119
|
||||
32 41 17
|
||||
5 27 4
|
||||
35 13 64
|
||||
77 19 87
|
||||
91 0 104
|
||||
91 0 116
|
||||
76 19 19
|
||||
95 42 52
|
||||
99 27 63
|
||||
23 45 125
|
||||
3 19 45
|
||||
34 27 99
|
||||
82 13 29
|
||||
73 30 122
|
||||
25 13 58
|
||||
134 2 5
|
||||
102 20 77
|
||||
28 27 19
|
||||
32 41 36
|
||||
87 30 4
|
||||
53 40 89
|
||||
99 43 59
|
||||
89 12 77
|
||||
61 17 46
|
||||
95 6 50
|
||||
73 27 115
|
||||
86 28 63
|
||||
87 9 113
|
||||
80 26 57
|
||||
35 27 99
|
||||
102 13 8
|
||||
87 36 44
|
||||
11 20 81
|
||||
77 27 87
|
||||
74 45 88
|
||||
117 41 115
|
||||
60 43 107
|
||||
77 19 19
|
||||
88 26 51
|
||||
32 29 82
|
||||
12 43 107
|
||||
126 20 77
|
||||
49 19 3
|
||||
73 5 19
|
||||
37 15 92
|
||||
50 9 81
|
||||
109 16 105
|
||||
106 0 28
|
||||
47 15 92
|
||||
99 27 19
|
||||
75 9 79
|
||||
15 13 37
|
||||
32 41 27
|
||||
28 27 42
|
||||
83 41 131
|
||||
47 13 60
|
||||
91 0 59
|
||||
36 13 56
|
||||
29 13 70
|
||||
100 5 89
|
||||
78 40 73
|
||||
64 27 5
|
||||
1 28 82
|
||||
5 29 82
|
||||
115 28 134
|
||||
109 27 5
|
||||
73 30 12
|
||||
67 20 77
|
||||
57 27 38
|
||||
77 19 49
|
||||
99 27 131
|
||||
115 43 18
|
||||
46 11 109
|
||||
19 12 77
|
||||
17 45 88
|
||||
78 27 39
|
||||
26 7 125
|
||||
22 14 48
|
||||
34 20 77
|
||||
77 36 89
|
||||
49 5 99
|
||||
27 43 107
|
||||
50 36 128
|
||||
10 20 100
|
||||
109 0 95
|
||||
19 27 16
|
||||
63 19 57
|
||||
78 26 49
|
||||
32 29 70
|
||||
113 30 9
|
||||
113 19 57
|
||||
106 6 77
|
||||
38 28 70
|
||||
57 5 73
|
||||
19 18 57
|
||||
21 19 116
|
||||
75 31 97
|
||||
106 16 105
|
||||
70 20 77
|
||||
121 43 0
|
||||
3 19 52
|
||||
105 20 77
|
||||
126 20 100
|
||||
3 19 50
|
||||
50 40 89
|
||||
5 43 116
|
||||
62 43 112
|
||||
21 27 36
|
||||
50 43 45
|
||||
77 5 52
|
||||
96 20 49
|
||||
65 28 99
|
||||
80 26 19
|
||||
75 26 81
|
||||
128 11 88
|
||||
83 28 21
|
||||
82 13 70
|
||||
59 18 87
|
||||
3 43 108
|
||||
51 11 106
|
||||
19 9 52
|
||||
113 29 20
|
||||
59 29 82
|
||||
57 40 100
|
||||
57 19 113
|
||||
104 15 43
|
||||
41 43 107
|
||||
28 43 63
|
||||
106 23 95
|
||||
10 40 99
|
||||
21 18 99
|
||||
10 40 21
|
||||
89 30 58
|
||||
10 25 57
|
||||
57 30 99
|
||||
63 27 16
|
||||
5 27 25
|
||||
89 19 110
|
||||
55 40 57
|
||||
77 19 88
|
||||
50 19 3
|
||||
105 13 70
|
||||
54 20 52
|
||||
83 28 36
|
||||
83 28 27
|
||||
116 19 19
|
||||
29 40 50
|
||||
125 5 89
|
||||
69 43 0
|
||||
86 43 83
|
||||
79 6 50
|
||||
84 26 73
|
||||
53 27 63
|
||||
5 9 21
|
||||
89 29 70
|
||||
50 19 21
|
||||
82 40 21
|
||||
72 33 84
|
||||
110 43 108
|
||||
64 20 19
|
||||
106 6 100
|
||||
65 41 4
|
||||
50 19 103
|
||||
75 5 50
|
||||
76 31 113
|
||||
28 27 99
|
||||
117 41 131
|
||||
116 19 73
|
||||
100 19 50
|
||||
87 29 53
|
||||
97 20 57
|
||||
82 27 99
|
||||
106 15 43
|
||||
5 30 89
|
||||
25 13 122
|
||||
109 23 30
|
||||
118 13 56
|
||||
109 16 19
|
||||
131 13 115
|
||||
78 3 57
|
||||
100 40 81
|
||||
34 2 87
|
||||
128 11 6
|
||||
71 41 131
|
||||
15 13 134
|
||||
95 20 73
|
||||
60 15 92
|
||||
114 20 81
|
||||
19 30 115
|
||||
81 19 28
|
||||
34 42 50
|
||||
34 40 113
|
||||
52 19 28
|
||||
21 18 5
|
||||
99 19 3
|
||||
85 31 113
|
||||
36 20 73
|
||||
38 28 10
|
||||
104 13 105
|
||||
11 20 57
|
||||
117 15 43
|
||||
10 40 52
|
||||
54 42 52
|
||||
114 43 107
|
||||
20 13 134
|
||||
30 13 20
|
||||
59 27 74
|
||||
89 9 52
|
||||
57 5 50
|
||||
27 28 20
|
||||
126 13 8
|
||||
21 27 116
|
||||
109 43 103
|
||||
57 40 81
|
||||
81 27 9
|
||||
42 5 5
|
||||
13 41 36
|
||||
11 13 29
|
||||
89 9 50
|
||||
29 40 57
|
||||
79 5 5
|
||||
5 9 59
|
||||
128 43 23
|
||||
7 27 57
|
||||
128 45 120
|
||||
19 5 21
|
||||
106 43 108
|
||||
20 2 71
|
||||
81 5 113
|
||||
28 19 21
|
||||
113 30 4
|
||||
64 15 43
|
||||
115 13 4
|
||||
70 27 63
|
||||
83 29 102
|
||||
110 19 49
|
||||
19 19 3
|
||||
126 13 7
|
||||
77 30 116
|
||||
81 41 118
|
||||
106 14 54
|
||||
91 34 106
|
||||
20 20 52
|
||||
81 27 131
|
||||
59 30 12
|
||||
106 0 95
|
||||
105 27 59
|
||||
89 19 57
|
||||
130 43 129
|
||||
31 13 82
|
||||
109 27 57
|
||||
8 43 64
|
||||
73 27 118
|
||||
53 20 73
|
||||
35 20 81
|
||||
134 27 77
|
||||
28 27 38
|
||||
13 28 1
|
||||
78 27 28
|
||||
4 13 56
|
||||
36 20 19
|
||||
32 29 53
|
||||
22 29 48
|
||||
46 11 123
|
||||
70 20 52
|
||||
112 15 43
|
||||
7 15 43
|
||||
27 13 16
|
||||
28 19 113
|
||||
76 17 58
|
||||
81 5 19
|
||||
77 19 110
|
||||
21 27 58
|
||||
83 41 25
|
||||
126 13 82
|
||||
0 15 92
|
||||
39 14 114
|
||||
53 20 49
|
||||
63 19 19
|
||||
20 13 29
|
||||
21 43 63
|
||||
34 40 28
|
||||
55 2 113
|
||||
73 18 77
|
||||
76 17 56
|
||||
30 13 102
|
||||
27 28 53
|
||||
5 19 50
|
||||
83 28 87
|
||||
127 43 93
|
||||
100 43 107
|
||||
86 15 43
|
||||
116 19 57
|
||||
76 5 5
|
||||
78 40 28
|
||||
22 29 101
|
||||
34 13 20
|
||||
34 20 49
|
||||
28 29 70
|
||||
57 27 21
|
||||
110 19 89
|
||||
5 27 87
|
||||
60 27 73
|
||||
109 27 113
|
||||
20 2 65
|
||||
127 5 87
|
||||
87 27 25
|
||||
99 27 21
|
||||
28 19 99
|
||||
34 27 21
|
||||
30 20 73
|
||||
116 27 99
|
||||
57 29 53
|
||||
109 0 8
|
||||
76 31 59
|
||||
21 27 63
|
||||
50 19 57
|
||||
115 13 1
|
||||
100 28 36
|
||||
34 6 77
|
||||
106 0 44
|
||||
89 19 63
|
||||
95 40 63
|
||||
95 2 87
|
||||
81 19 52
|
||||
83 28 59
|
||||
31 13 34
|
||||
134 43 20
|
||||
50 30 27
|
||||
134 13 64
|
||||
77 27 25
|
||||
75 43 93
|
||||
73 26 76
|
||||
49 9 52
|
||||
51 14 54
|
||||
51 6 50
|
||||
77 12 57
|
||||
57 15 43
|
||||
19 19 113
|
||||
57 30 25
|
||||
73 19 81
|
||||
100 5 19
|
||||
76 19 57
|
||||
52 15 92
|
||||
53 20 89
|
||||
42 17 27
|
||||
31 27 50
|
||||
60 20 100
|
||||
47 13 37
|
||||
34 3 73
|
||||
104 43 126
|
||||
20 2 5
|
||||
5 27 99
|
||||
34 27 57
|
||||
83 28 1
|
||||
26 7 129
|
||||
13 41 32
|
||||
29 2 86
|
||||
90 26 19
|
||||
71 28 113
|
||||
57 30 87
|
||||
95 15 43
|
||||
53 40 19
|
||||
19 30 99
|
||||
81 28 1
|
||||
13 41 9
|
||||
87 27 50
|
||||
67 20 89
|
||||
32 41 111
|
||||
59 19 50
|
||||
101 42 49
|
||||
19 27 4
|
||||
111 28 59
|
||||
19 36 57
|
||||
70 43 37
|
||||
65 15 92
|
||||
65 29 31
|
||||
27 43 56
|
||||
77 36 52
|
||||
55 15 92
|
||||
124 28 84
|
||||
86 29 20
|
||||
36 43 18
|
||||
87 19 45
|
||||
87 43 63
|
||||
126 13 31
|
||||
126 27 116
|
||||
106 26 57
|
||||
59 29 134
|
||||
89 30 25
|
||||
47 27 63
|
||||
50 29 20
|
||||
89 26 76
|
||||
109 23 134
|
||||
126 43 98
|
||||
89 27 131
|
||||
6 26 50
|
||||
113 18 5
|
||||
64 27 57
|
||||
82 22 96
|
||||
82 43 20
|
||||
91 0 53
|
||||
50 18 57
|
||||
95 13 134
|
||||
1 13 16
|
||||
49 27 1
|
||||
73 19 59
|
||||
13 28 36
|
||||
95 27 77
|
||||
63 19 49
|
||||
57 9 81
|
||||
57 19 49
|
||||
111 29 132
|
||||
77 30 89
|
||||
84 15 92
|
||||
73 5 57
|
||||
1 20 73
|
||||
52 40 50
|
||||
104 13 95
|
||||
59 19 116
|
||||
57 29 86
|
||||
106 16 70
|
||||
81 5 5
|
||||
55 22 96
|
||||
30 27 77
|
||||
109 0 76
|
||||
37 13 10
|
||||
106 6 19
|
||||
52 40 19
|
||||
59 19 100
|
||||
63 27 27
|
||||
87 29 82
|
||||
125 26 57
|
||||
116 27 28
|
||||
6 26 73
|
||||
83 28 38
|
||||
5 27 73
|
||||
57 27 74
|
||||
49 36 128
|
||||
113 27 76
|
||||
91 0 57
|
||||
19 36 39
|
||||
89 27 58
|
||||
110 43 45
|
||||
77 18 50
|
||||
75 43 107
|
||||
1 13 122
|
||||
35 20 89
|
||||
117 41 36
|
||||
77 30 25
|
5216
dataset/umls/get_neighbor/train2id.txt
Normal file
File diff suppressed because it is too large
652
dataset/umls/get_neighbor/valid2id.txt
Normal file
@ -0,0 +1,652 @@
|
||||
31 27 89
|
||||
39 11 125
|
||||
57 30 59
|
||||
60 27 63
|
||||
57 27 115
|
||||
34 27 77
|
||||
104 27 89
|
||||
83 28 52
|
||||
130 34 109
|
||||
77 43 73
|
||||
91 0 87
|
||||
87 27 118
|
||||
39 11 90
|
||||
109 0 113
|
||||
86 24 24
|
||||
81 27 87
|
||||
131 28 20
|
||||
73 12 89
|
||||
38 28 134
|
||||
7 20 81
|
||||
30 43 126
|
||||
15 13 104
|
||||
128 29 114
|
||||
49 19 99
|
||||
126 13 64
|
||||
12 13 16
|
||||
71 29 31
|
||||
29 43 20
|
||||
32 28 24
|
||||
42 12 76
|
||||
70 43 20
|
||||
118 13 58
|
||||
52 19 50
|
||||
87 9 21
|
||||
15 13 20
|
||||
95 2 28
|
||||
113 30 25
|
||||
49 19 88
|
||||
123 0 44
|
||||
7 43 98
|
||||
59 30 28
|
||||
51 11 103
|
||||
19 19 21
|
||||
94 28 52
|
||||
34 40 49
|
||||
30 13 15
|
||||
134 40 52
|
||||
71 29 132
|
||||
52 26 42
|
||||
28 27 4
|
||||
127 26 52
|
||||
109 0 126
|
||||
19 19 89
|
||||
21 18 113
|
||||
24 15 92
|
||||
89 30 118
|
||||
49 19 21
|
||||
49 19 52
|
||||
0 43 93
|
||||
59 30 122
|
||||
81 19 87
|
||||
41 28 91
|
||||
104 20 57
|
||||
73 18 57
|
||||
110 19 52
|
||||
105 20 57
|
||||
59 27 9
|
||||
134 20 49
|
||||
7 20 100
|
||||
11 13 64
|
||||
86 29 31
|
||||
111 28 28
|
||||
76 17 118
|
||||
104 13 35
|
||||
19 27 113
|
||||
77 30 19
|
||||
73 19 63
|
||||
127 5 89
|
||||
100 28 27
|
||||
8 20 57
|
||||
37 13 8
|
||||
50 30 19
|
||||
73 27 50
|
||||
53 40 73
|
||||
47 27 50
|
||||
16 45 125
|
||||
10 43 107
|
||||
106 0 59
|
||||
102 43 107
|
||||
73 30 131
|
||||
133 17 71
|
||||
77 27 63
|
||||
97 43 107
|
||||
57 30 4
|
||||
55 40 81
|
||||
94 28 19
|
||||
28 30 118
|
||||
116 27 5
|
||||
134 43 37
|
||||
30 13 105
|
||||
99 19 19
|
||||
57 30 12
|
||||
109 16 50
|
||||
115 43 56
|
||||
42 31 5
|
||||
4 43 107
|
||||
112 43 0
|
||||
55 27 59
|
||||
15 43 47
|
||||
128 15 43
|
||||
75 31 105
|
||||
124 28 109
|
||||
7 43 126
|
||||
78 27 19
|
||||
116 19 5
|
||||
37 20 81
|
||||
69 26 52
|
||||
121 19 113
|
||||
99 19 110
|
||||
8 27 77
|
||||
37 27 73
|
||||
59 27 122
|
||||
57 5 5
|
||||
86 29 134
|
||||
35 20 77
|
||||
75 7 113
|
||||
8 20 77
|
||||
57 19 5
|
||||
99 29 134
|
||||
89 19 5
|
||||
113 27 12
|
||||
35 27 116
|
||||
89 26 73
|
||||
113 27 57
|
||||
28 27 63
|
||||
50 5 99
|
||||
11 20 100
|
||||
75 31 126
|
||||
18 43 107
|
||||
94 28 73
|
||||
134 40 28
|
||||
123 27 113
|
||||
109 0 73
|
||||
15 20 77
|
||||
49 27 4
|
||||
106 27 73
|
||||
10 40 77
|
||||
27 15 43
|
||||
53 40 99
|
||||
57 40 50
|
||||
5 27 28
|
||||
78 43 103
|
||||
50 19 88
|
||||
78 34 43
|
||||
104 27 113
|
||||
54 20 100
|
||||
87 27 38
|
||||
89 29 29
|
||||
67 20 57
|
||||
47 13 10
|
||||
28 30 36
|
||||
21 30 12
|
||||
116 19 100
|
||||
70 40 100
|
||||
71 41 111
|
||||
19 19 45
|
||||
19 27 118
|
||||
39 11 80
|
||||
76 19 87
|
||||
20 27 73
|
||||
13 28 50
|
||||
70 27 21
|
||||
101 20 77
|
||||
123 0 34
|
||||
54 42 49
|
||||
56 43 18
|
||||
59 30 36
|
||||
21 19 5
|
||||
34 3 19
|
||||
101 20 89
|
||||
31 27 113
|
||||
77 30 21
|
||||
64 27 89
|
||||
31 13 134
|
||||
131 13 27
|
||||
126 13 37
|
||||
50 26 42
|
||||
100 41 56
|
||||
71 28 50
|
||||
111 28 36
|
||||
15 27 73
|
||||
99 29 82
|
||||
109 26 81
|
||||
34 20 73
|
||||
49 27 21
|
||||
51 43 23
|
||||
79 26 49
|
||||
29 20 100
|
||||
99 30 28
|
||||
70 40 21
|
||||
42 31 59
|
||||
13 28 89
|
||||
52 19 45
|
||||
47 27 116
|
||||
28 27 89
|
||||
113 27 88
|
||||
63 30 1
|
||||
106 23 35
|
||||
50 27 21
|
||||
1 28 53
|
||||
26 7 103
|
||||
28 27 115
|
||||
89 30 63
|
||||
113 30 5
|
||||
5 19 52
|
||||
57 18 77
|
||||
19 12 89
|
||||
39 29 101
|
||||
34 13 64
|
||||
19 5 57
|
||||
28 30 87
|
||||
76 5 28
|
||||
115 15 43
|
||||
40 15 43
|
||||
45 19 87
|
||||
64 27 87
|
||||
75 5 87
|
||||
100 27 87
|
||||
6 26 52
|
||||
75 31 29
|
||||
100 41 25
|
||||
57 5 113
|
||||
109 23 47
|
||||
19 19 63
|
||||
82 2 32
|
||||
77 5 59
|
||||
128 29 48
|
||||
113 27 25
|
||||
101 42 89
|
||||
45 19 113
|
||||
13 41 1
|
||||
59 27 17
|
||||
78 3 50
|
||||
60 43 98
|
||||
109 16 21
|
||||
16 13 56
|
||||
131 43 18
|
||||
71 43 107
|
||||
42 17 12
|
||||
106 23 105
|
||||
34 27 116
|
||||
71 29 70
|
||||
57 27 116
|
||||
57 19 103
|
||||
106 0 53
|
||||
27 13 122
|
||||
21 27 76
|
||||
82 13 53
|
||||
40 33 109
|
||||
32 28 5
|
||||
15 22 96
|
||||
3 19 116
|
||||
123 15 92
|
||||
37 20 49
|
||||
17 43 9
|
||||
63 27 12
|
||||
57 27 56
|
||||
70 27 116
|
||||
34 6 19
|
||||
49 5 21
|
||||
95 43 64
|
||||
128 45 88
|
||||
87 30 118
|
||||
51 11 130
|
||||
50 43 108
|
||||
57 30 19
|
||||
49 28 19
|
||||
108 15 43
|
||||
89 36 51
|
||||
35 27 50
|
||||
89 43 63
|
||||
103 34 92
|
||||
105 27 50
|
||||
131 13 36
|
||||
39 15 92
|
||||
72 33 80
|
||||
123 0 59
|
||||
49 41 74
|
||||
53 27 113
|
||||
31 20 52
|
||||
77 27 5
|
||||
76 19 50
|
||||
73 27 27
|
||||
40 28 103
|
||||
55 2 13
|
||||
118 45 88
|
||||
63 19 100
|
||||
73 27 113
|
||||
106 0 15
|
||||
59 9 21
|
||||
46 14 101
|
||||
23 11 130
|
||||
109 26 73
|
||||
57 9 100
|
||||
109 0 35
|
||||
81 19 73
|
||||
94 28 28
|
||||
123 0 7
|
||||
12 20 57
|
||||
109 23 82
|
||||
19 27 63
|
||||
73 27 99
|
||||
86 15 92
|
||||
63 27 131
|
||||
81 27 17
|
||||
94 28 89
|
||||
101 42 73
|
||||
76 19 28
|
||||
71 28 1
|
||||
71 41 86
|
||||
86 29 82
|
||||
75 25 57
|
||||
89 40 52
|
||||
20 20 50
|
||||
78 15 43
|
||||
85 31 94
|
||||
51 43 107
|
||||
71 27 5
|
||||
104 27 19
|
||||
10 40 87
|
||||
71 41 17
|
||||
75 5 59
|
||||
89 36 46
|
||||
78 42 89
|
||||
22 43 23
|
||||
94 28 86
|
||||
85 31 112
|
||||
109 43 129
|
||||
106 16 30
|
||||
71 15 92
|
||||
5 30 17
|
||||
69 43 93
|
||||
134 27 89
|
||||
20 43 64
|
||||
29 2 13
|
||||
1 28 70
|
||||
51 14 48
|
||||
50 19 78
|
||||
35 20 57
|
||||
79 7 63
|
||||
21 30 74
|
||||
68 15 43
|
||||
87 29 20
|
||||
75 31 64
|
||||
10 2 111
|
||||
103 15 43
|
||||
102 13 34
|
||||
57 19 106
|
||||
89 19 87
|
||||
65 28 5
|
||||
30 15 92
|
||||
65 28 50
|
||||
60 20 49
|
||||
50 19 113
|
||||
81 19 77
|
||||
28 43 21
|
||||
49 5 28
|
||||
75 26 19
|
||||
113 29 82
|
||||
58 45 120
|
||||
6 26 57
|
||||
96 20 52
|
||||
123 26 73
|
||||
77 30 4
|
||||
111 41 12
|
||||
54 42 79
|
||||
134 27 19
|
||||
9 43 18
|
||||
50 19 106
|
||||
54 43 107
|
||||
78 14 96
|
||||
29 27 77
|
||||
106 23 134
|
||||
74 45 125
|
||||
89 30 21
|
||||
109 6 77
|
||||
78 40 113
|
||||
7 13 11
|
||||
21 27 131
|
||||
43 43 93
|
||||
109 23 102
|
||||
104 13 53
|
||||
128 11 91
|
||||
105 13 29
|
||||
55 2 65
|
||||
100 19 21
|
||||
30 13 134
|
||||
81 41 27
|
||||
96 20 81
|
||||
24 15 43
|
||||
78 27 113
|
||||
103 26 52
|
||||
59 18 5
|
||||
41 33 123
|
||||
28 30 59
|
||||
57 27 50
|
||||
106 27 77
|
||||
106 15 92
|
||||
90 34 43
|
||||
75 5 28
|
||||
51 29 48
|
||||
75 31 95
|
||||
95 27 50
|
||||
53 27 28
|
||||
134 20 81
|
||||
111 29 70
|
||||
82 27 63
|
||||
83 28 19
|
||||
53 27 21
|
||||
123 0 76
|
||||
127 5 5
|
||||
89 5 21
|
||||
103 27 89
|
||||
34 13 10
|
||||
19 29 132
|
||||
106 0 8
|
||||
64 27 116
|
||||
123 0 8
|
||||
111 10 119
|
||||
19 27 27
|
||||
64 20 81
|
||||
87 19 113
|
||||
77 36 128
|
||||
73 27 74
|
||||
59 43 45
|
||||
109 23 70
|
||||
90 26 73
|
||||
113 19 49
|
||||
86 29 30
|
||||
71 41 65
|
||||
89 27 118
|
||||
10 20 49
|
||||
34 42 49
|
||||
104 43 60
|
||||
134 20 73
|
||||
34 42 100
|
||||
49 41 131
|
||||
89 30 113
|
||||
51 45 125
|
||||
65 28 63
|
||||
55 43 37
|
||||
28 19 59
|
||||
105 22 96
|
||||
49 27 74
|
||||
23 14 68
|
||||
126 13 104
|
||||
53 43 20
|
||||
20 27 59
|
||||
73 18 89
|
||||
109 16 20
|
||||
28 29 82
|
||||
20 2 13
|
||||
63 29 53
|
||||
115 28 82
|
||||
50 29 53
|
||||
5 36 113
|
||||
31 27 59
|
||||
49 41 38
|
||||
101 42 50
|
||||
57 30 50
|
||||
69 26 77
|
||||
30 13 7
|
||||
28 43 116
|
||||
19 19 88
|
||||
89 36 39
|
||||
52 36 51
|
||||
55 40 100
|
||||
12 20 73
|
||||
49 36 51
|
||||
60 27 89
|
||||
42 26 76
|
||||
60 27 113
|
||||
12 13 17
|
||||
71 41 9
|
||||
5 30 58
|
||||
132 10 119
|
||||
111 29 134
|
||||
102 13 11
|
||||
81 41 58
|
||||
116 27 59
|
||||
98 20 77
|
||||
13 24 32
|
||||
52 43 45
|
||||
106 6 81
|
||||
24 28 52
|
||||
28 19 50
|
||||
134 40 99
|
||||
50 19 110
|
||||
103 27 77
|
||||
40 33 106
|
||||
103 27 113
|
||||
23 29 114
|
||||
65 28 100
|
||||
9 43 118
|
||||
91 0 8
|
||||
40 29 68
|
||||
133 43 0
|
||||
99 9 21
|
||||
89 5 99
|
||||
63 30 58
|
||||
125 27 88
|
||||
73 9 52
|
||||
12 20 50
|
||||
16 13 122
|
||||
114 20 19
|
||||
85 15 43
|
||||
40 28 78
|
||||
54 3 19
|
||||
83 41 12
|
||||
58 43 107
|
||||
111 29 53
|
||||
59 27 58
|
||||
109 23 8
|
||||
3 43 45
|
||||
50 30 9
|
||||
40 33 123
|
||||
50 18 77
|
||||
50 27 74
|
||||
109 16 55
|
||||
81 15 43
|
||||
82 27 113
|
||||
109 23 95
|
||||
94 28 99
|
||||
19 19 52
|
||||
109 16 57
|
||||
100 27 118
|
||||
8 13 10
|
||||
87 43 116
|
||||
68 43 66
|
||||
116 19 81
|
||||
37 27 57
|
||||
133 19 113
|
||||
42 17 17
|
||||
58 45 125
|
||||
116 27 19
|
||||
5 30 57
|
||||
20 40 113
|
||||
109 16 8
|
||||
8 13 64
|
||||
57 26 77
|
||||
26 43 0
|
||||
79 7 50
|
||||
134 13 53
|
||||
32 28 73
|
||||
106 16 29
|
||||
49 41 115
|
||||
30 13 55
|
||||
106 16 60
|
||||
36 13 12
|
||||
75 31 21
|
||||
49 27 113
|
||||
19 27 17
|
||||
133 43 107
|
||||
113 19 63
|
||||
126 27 63
|
||||
7 13 82
|
||||
73 19 49
|
||||
123 0 47
|
||||
78 26 89
|
||||
21 19 113
|
||||
42 19 3
|
||||
109 0 34
|
||||
32 41 12
|
||||
70 40 77
|
||||
42 5 59
|
||||
87 19 49
|
||||
51 13 128
|
||||
116 27 57
|
||||
124 33 123
|
||||
13 41 27
|
||||
83 29 29
|
||||
126 13 35
|
||||
116 19 3
|
||||
57 27 73
|
||||
83 15 43
|
||||
110 19 50
|
||||
50 5 21
|
||||
111 28 113
|
||||
53 20 52
|
||||
79 6 19
|
||||
34 2 113
|
||||
113 18 87
|
||||
47 27 77
|
||||
70 2 59
|
||||
95 20 52
|
||||
51 11 90
|
||||
84 43 129
|
||||
100 28 19
|
||||
57 9 73
|
||||
37 20 89
|
||||
63 30 4
|
||||
82 40 89
|
||||
13 28 38
|
||||
100 19 89
|
||||
5 29 29
|
||||
91 0 50
|
||||
19 30 87
|
||||
127 5 19
|
||||
73 30 58
|
||||
5 30 4
|
||||
35 27 21
|
||||
41 33 91
|
||||
55 2 5
|
||||
113 30 12
|
||||
86 28 50
|
||||
34 43 95
|
||||
78 3 89
|
||||
7 27 19
|
||||
95 3 19
|
||||
120 19 113
|
||||
88 26 69
|
||||
86 41 111
|
||||
59 27 38
|
||||
22 11 90
|
||||
20 2 87
|
||||
49 43 81
|
||||
59 27 115
|
||||
100 19 3
|
||||
100 19 110
|
||||
57 30 89
|
||||
14 43 107
|
||||
2 43 93
|
||||
89 30 77
|
||||
8 20 81
|
||||
55 20 100
|
||||
34 2 32
|
||||
19 27 115
|
||||
127 5 50
|
||||
105 27 116
|
||||
74 13 122
|
||||
24 28 113
|
||||
98 20 57
|
||||
63 27 99
|
||||
35 20 52
|
||||
123 0 95
|
||||
52 19 110
|
||||
124 15 92
|
||||
5 30 113
|
||||
123 26 89
|
||||
4 15 43
|
||||
59 27 19
|
||||
104 27 73
|
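Note: each line in the get_neighbor/*2id.txt files above encodes one triple as three space-separated integers; the middle value never exceeds 45 and UMLS defines 46 relations (listed further below), so the order appears to be head_id relation_id tail_id. A minimal, hypothetical loader sketch (the path and the assumption that there is no leading count line are mine, not the repository's):

from typing import List, Tuple

def load_id_triples(path: str) -> List[Tuple[int, int, int]]:
    # One 'head relation tail' integer triple per line (assumed format).
    triples = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if len(parts) == 3:
                h, r, t = map(int, parts)
                triples.append((h, r, t))
    return triples

# e.g. valid_triples = load_id_triples("dataset/umls/get_neighbor/valid2id.txt")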
46
dataset/umls/relation2text.txt
Normal file
@ -0,0 +1,46 @@
|
||||
measures measures
|
||||
derivative_of derivative of
|
||||
disrupts disrupts
|
||||
prevents prevents
|
||||
conceptually_related_to conceptually related to
|
||||
manifestation_of manifestation of
|
||||
diagnoses diagnoses
|
||||
evaluation_of evaluation of
|
||||
contains contains
|
||||
co-occurs_with co-occurs with
|
||||
conceptual_part_of conceptual part of
|
||||
performs performs
|
||||
degree_of degree of
|
||||
interacts_with interacts with
|
||||
uses uses
|
||||
issue_in issue in
|
||||
assesses_effect_of assesses effect of
|
||||
property_of property of
|
||||
precedes precedes
|
||||
result_of result of
|
||||
causes causes
|
||||
practices practices
|
||||
ingredient_of ingredient of
|
||||
analyzes analyzes
|
||||
surrounds surrounds
|
||||
indicates indicates
|
||||
associated_with associated with
|
||||
affects affects
|
||||
location_of location of
|
||||
produces produces
|
||||
process_of process of
|
||||
measurement_of measurement of
|
||||
connected_to connected to
|
||||
carries_out carries out
|
||||
method_of method of
|
||||
adjacent_to adjacent to
|
||||
occurs_in occurs in
|
||||
consists_of consists of
|
||||
interconnects interconnects
|
||||
manages manages
|
||||
complicates complicates
|
||||
part_of part of
|
||||
treats treats
|
||||
isa is a
|
||||
developmental_form_of developmental form of
|
||||
exhibits exhibits
|
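Note: each line of relation2text.txt above pairs a UMLS relation identifier with its natural-language surface form (first token = identifier, remainder = text). A minimal loader sketch (the path is taken from the diff header; the whitespace split is an assumption):

def load_relation2text(path: str = "dataset/umls/relation2text.txt") -> dict:
    # Map e.g. 'co-occurs_with' -> 'co-occurs with'.
    rel2text = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split(maxsplit=1)
            if parts:
                rel2text[parts[0]] = parts[1] if len(parts) > 1 else parts[0]
    return rel2text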
46
dataset/umls/relations.txt
Normal file
@ -0,0 +1,46 @@
|
||||
measures
|
||||
derivative_of
|
||||
disrupts
|
||||
prevents
|
||||
conceptually_related_to
|
||||
manifestation_of
|
||||
diagnoses
|
||||
evaluation_of
|
||||
contains
|
||||
co-occurs_with
|
||||
conceptual_part_of
|
||||
performs
|
||||
degree_of
|
||||
interacts_with
|
||||
uses
|
||||
issue_in
|
||||
assesses_effect_of
|
||||
property_of
|
||||
precedes
|
||||
result_of
|
||||
causes
|
||||
practices
|
||||
ingredient_of
|
||||
analyzes
|
||||
surrounds
|
||||
indicates
|
||||
associated_with
|
||||
affects
|
||||
location_of
|
||||
produces
|
||||
process_of
|
||||
measurement_of
|
||||
connected_to
|
||||
carries_out
|
||||
method_of
|
||||
adjacent_to
|
||||
occurs_in
|
||||
consists_of
|
||||
interconnects
|
||||
manages
|
||||
complicates
|
||||
part_of
|
||||
treats
|
||||
isa
|
||||
developmental_form_of
|
||||
exhibits
|
661
dataset/umls/test.tsv
Normal file
@ -0,0 +1,661 @@
|
||||
steroid interacts_with eicosanoid
|
||||
clinical_attribute isa conceptual_entity
|
||||
body_location_or_region location_of physiologic_function
|
||||
neoplastic_process isa disease_or_syndrome
|
||||
carbohydrate affects molecular_function
|
||||
disease_or_syndrome affects organ_or_tissue_function
|
||||
substance issue_in occupation_or_discipline
|
||||
laboratory_or_test_result evaluation_of genetic_function
|
||||
chemical_viewed_functionally isa entity
|
||||
organophosphorus_compound ingredient_of clinical_drug
|
||||
mental_or_behavioral_dysfunction affects social_behavior
|
||||
human_caused_phenomenon_or_process result_of pathologic_function
|
||||
amino_acid_peptide_or_protein interacts_with antibiotic
|
||||
hormone affects experimental_model_of_disease
|
||||
antibiotic affects cell_function
|
||||
embryonic_structure part_of bird
|
||||
tissue produces organic_chemical
|
||||
genetic_function process_of organ_or_tissue_function
|
||||
congenital_abnormality part_of mammal
|
||||
inorganic_chemical causes cell_or_molecular_dysfunction
|
||||
receptor disrupts cell
|
||||
professional_society location_of laboratory_procedure
|
||||
organism_function co-occurs_with cell_function
|
||||
immunologic_factor causes cell_or_molecular_dysfunction
|
||||
vitamin affects neoplastic_process
|
||||
antibiotic complicates anatomical_abnormality
|
||||
quantitative_concept measurement_of physiologic_function
|
||||
pathologic_function affects biologic_function
|
||||
congenital_abnormality issue_in occupation_or_discipline
|
||||
tissue adjacent_to body_space_or_junction
|
||||
vitamin disrupts organ_or_tissue_function
|
||||
receptor isa substance
|
||||
mental_process isa organism_function
|
||||
vertebrate exhibits individual_behavior
|
||||
body_location_or_region location_of therapeutic_or_preventive_procedure
|
||||
physical_object issue_in biomedical_occupation_or_discipline
|
||||
inorganic_chemical affects cell_function
|
||||
neoplastic_process affects vertebrate
|
||||
cell_function affects mammal
|
||||
population_group uses medical_device
|
||||
human_caused_phenomenon_or_process result_of molecular_function
|
||||
pharmacologic_substance treats neoplastic_process
|
||||
embryonic_structure location_of cell_or_molecular_dysfunction
|
||||
experimental_model_of_disease result_of laboratory_procedure
|
||||
enzyme disrupts cell_function
|
||||
pathologic_function complicates injury_or_poisoning
|
||||
mental_or_behavioral_dysfunction degree_of disease_or_syndrome
|
||||
plant interacts_with animal
|
||||
disease_or_syndrome process_of pathologic_function
|
||||
pathologic_function result_of diagnostic_procedure
|
||||
anatomical_abnormality manifestation_of biologic_function
|
||||
laboratory_or_test_result manifestation_of pathologic_function
|
||||
fish issue_in biomedical_occupation_or_discipline
|
||||
carbohydrate affects cell_or_molecular_dysfunction
|
||||
biologic_function affects virus
|
||||
mental_process precedes cell_function
|
||||
experimental_model_of_disease occurs_in mental_or_behavioral_dysfunction
|
||||
congenital_abnormality part_of bacterium
|
||||
human_caused_phenomenon_or_process result_of physiologic_function
|
||||
cell location_of therapeutic_or_preventive_procedure
|
||||
experimental_model_of_disease process_of natural_phenomenon_or_process
|
||||
neoplastic_process result_of phenomenon_or_process
|
||||
neuroreactive_substance_or_biogenic_amine affects organism_function
|
||||
mental_or_behavioral_dysfunction process_of archaeon
|
||||
congenital_abnormality part_of virus
|
||||
pathologic_function manifestation_of genetic_function
|
||||
biologically_active_substance causes mental_or_behavioral_dysfunction
|
||||
pharmacologic_substance treats disease_or_syndrome
|
||||
body_space_or_junction location_of cell_function
|
||||
genetic_function affects virus
|
||||
acquired_abnormality result_of diagnostic_procedure
|
||||
physiologic_function affects archaeon
|
||||
cell_component location_of pathologic_function
|
||||
molecular_biology_research_technique measures chemical_viewed_functionally
|
||||
molecular_function result_of human_caused_phenomenon_or_process
|
||||
professional_or_occupational_group uses manufactured_object
|
||||
receptor disrupts molecular_function
|
||||
organ_or_tissue_function process_of biologic_function
|
||||
molecular_biology_research_technique measures element_ion_or_isotope
|
||||
physiologic_function result_of experimental_model_of_disease
|
||||
physiologic_function produces biologically_active_substance
|
||||
fully_formed_anatomical_structure part_of amphibian
|
||||
enzyme complicates organism_function
|
||||
tissue location_of virus
|
||||
invertebrate isa animal
|
||||
mental_process co-occurs_with genetic_function
|
||||
anatomical_abnormality location_of bacterium
|
||||
fully_formed_anatomical_structure location_of cell_or_molecular_dysfunction
|
||||
organ_or_tissue_function co-occurs_with genetic_function
|
||||
physiologic_function result_of human_caused_phenomenon_or_process
|
||||
molecular_function result_of cell_or_molecular_dysfunction
|
||||
diagnostic_procedure associated_with pathologic_function
|
||||
physiologic_function result_of neoplastic_process
|
||||
lipid interacts_with pharmacologic_substance
|
||||
cell_component produces receptor
|
||||
physiologic_function result_of congenital_abnormality
|
||||
age_group performs machine_activity
|
||||
congenital_abnormality associated_with organism_attribute
|
||||
laboratory_procedure analyzes amino_acid_peptide_or_protein
|
||||
vitamin affects mental_process
|
||||
cell_component part_of organism
|
||||
classification isa conceptual_entity
|
||||
organism_function result_of pathologic_function
|
||||
organism_function produces vitamin
|
||||
laboratory_procedure diagnoses mental_or_behavioral_dysfunction
|
||||
carbohydrate affects pathologic_function
|
||||
cell_component isa entity
|
||||
pathologic_function affects organism
|
||||
mental_process affects bacterium
|
||||
laboratory_or_test_result indicates organ_or_tissue_function
|
||||
tissue produces body_substance
|
||||
cell_component part_of body_part_organ_or_organ_component
|
||||
molecular_function affects mental_or_behavioral_dysfunction
|
||||
carbohydrate interacts_with immunologic_factor
|
||||
daily_or_recreational_activity associated_with acquired_abnormality
|
||||
molecular_function result_of disease_or_syndrome
|
||||
neoplastic_process occurs_in injury_or_poisoning
|
||||
fully_formed_anatomical_structure part_of archaeon
|
||||
carbohydrate affects mental_process
|
||||
element_ion_or_isotope interacts_with hazardous_or_poisonous_substance
|
||||
injury_or_poisoning disrupts molecular_function
|
||||
professional_or_occupational_group interacts_with population_group
|
||||
mental_or_behavioral_dysfunction affects archaeon
|
||||
organic_chemical affects neoplastic_process
|
||||
substance causes disease_or_syndrome
|
||||
biologically_active_substance causes disease_or_syndrome
|
||||
injury_or_poisoning disrupts tissue
|
||||
pathologic_function issue_in occupation_or_discipline
|
||||
mental_or_behavioral_dysfunction process_of reptile
|
||||
acquired_abnormality manifestation_of pathologic_function
|
||||
element_ion_or_isotope issue_in biomedical_occupation_or_discipline
|
||||
behavior associated_with age_group
|
||||
disease_or_syndrome complicates cell_or_molecular_dysfunction
|
||||
fully_formed_anatomical_structure produces receptor
|
||||
chemical causes injury_or_poisoning
|
||||
mental_or_behavioral_dysfunction affects reptile
|
||||
biologic_function affects organism_function
|
||||
antibiotic interacts_with neuroreactive_substance_or_biogenic_amine
|
||||
biologically_active_substance affects mental_or_behavioral_dysfunction
|
||||
laboratory_procedure assesses_effect_of genetic_function
|
||||
research_activity measures amino_acid_peptide_or_protein
|
||||
disease_or_syndrome affects cell_or_molecular_dysfunction
|
||||
pathologic_function result_of disease_or_syndrome
|
||||
disease_or_syndrome occurs_in mental_or_behavioral_dysfunction
|
||||
social_behavior associated_with patient_or_disabled_group
|
||||
antibiotic diagnoses mental_or_behavioral_dysfunction
|
||||
pathologic_function result_of organism_function
|
||||
plant interacts_with reptile
|
||||
mental_or_behavioral_dysfunction affects amphibian
|
||||
sign_or_symptom diagnoses mental_or_behavioral_dysfunction
|
||||
biologic_function result_of mental_or_behavioral_dysfunction
|
||||
biologic_function affects cell_or_molecular_dysfunction
|
||||
anatomical_abnormality part_of vertebrate
|
||||
family_group interacts_with group
|
||||
experimental_model_of_disease process_of fish
|
||||
mental_or_behavioral_dysfunction affects natural_phenomenon_or_process
|
||||
organism_function affects alga
|
||||
cell_component location_of body_space_or_junction
|
||||
body_part_organ_or_organ_component location_of genetic_function
|
||||
chemical affects genetic_function
|
||||
chemical_viewed_functionally affects physiologic_function
|
||||
language issue_in biomedical_occupation_or_discipline
|
||||
body_part_organ_or_organ_component location_of organ_or_tissue_function
|
||||
research_activity isa occupational_activity
|
||||
diagnostic_procedure analyzes hazardous_or_poisonous_substance
|
||||
experimental_model_of_disease occurs_in group
|
||||
nucleic_acid_nucleoside_or_nucleotide isa entity
|
||||
diagnostic_procedure associated_with congenital_abnormality
|
||||
occupational_activity associated_with cell_or_molecular_dysfunction
|
||||
organ_or_tissue_function result_of acquired_abnormality
|
||||
molecular_function affects cell_function
|
||||
chemical_viewed_structurally ingredient_of clinical_drug
|
||||
mental_or_behavioral_dysfunction result_of behavior
|
||||
chemical_viewed_structurally interacts_with carbohydrate
|
||||
cell_function affects neoplastic_process
|
||||
pathologic_function occurs_in mental_or_behavioral_dysfunction
|
||||
family_group performs therapeutic_or_preventive_procedure
|
||||
family_group produces research_device
|
||||
amino_acid_peptide_or_protein affects mental_process
|
||||
hormone complicates biologic_function
|
||||
anatomical_abnormality result_of health_care_activity
|
||||
organism_function process_of biologic_function
|
||||
immunologic_factor complicates disease_or_syndrome
|
||||
nucleotide_sequence property_of nucleic_acid_nucleoside_or_nucleotide
|
||||
organization location_of occupational_activity
|
||||
clinical_attribute result_of genetic_function
|
||||
vitamin causes anatomical_abnormality
|
||||
mental_or_behavioral_dysfunction affects organ_or_tissue_function
|
||||
neoplastic_process result_of mental_or_behavioral_dysfunction
|
||||
body_part_organ_or_organ_component produces organic_chemical
|
||||
virus isa organism
|
||||
organ_or_tissue_function process_of physiologic_function
|
||||
individual_behavior associated_with occupation_or_discipline
|
||||
hazardous_or_poisonous_substance affects disease_or_syndrome
|
||||
cell_or_molecular_dysfunction affects physiologic_function
|
||||
hormone disrupts tissue
|
||||
molecular_function affects animal
|
||||
physiologic_function affects molecular_function
|
||||
biologic_function affects physiologic_function
|
||||
laboratory_or_test_result indicates mental_process
|
||||
chemical_viewed_structurally affects organ_or_tissue_function
|
||||
cell_function result_of disease_or_syndrome
|
||||
amino_acid_peptide_or_protein affects biologic_function
|
||||
experimental_model_of_disease affects biologic_function
|
||||
biomedical_or_dental_material affects biologic_function
|
||||
molecular_biology_research_technique measures amino_acid_peptide_or_protein
|
||||
experimental_model_of_disease affects bacterium
|
||||
antibiotic affects organ_or_tissue_function
|
||||
human isa mammal
|
||||
population_group performs daily_or_recreational_activity
|
||||
cell_component conceptual_part_of body_system
|
||||
cell part_of reptile
|
||||
organ_or_tissue_function affects human
|
||||
indicator_reagent_or_diagnostic_aid interacts_with chemical
|
||||
cell_or_molecular_dysfunction result_of organism_function
|
||||
molecular_biology_research_technique measures eicosanoid
|
||||
molecular_biology_research_technique measures natural_phenomenon_or_process
|
||||
organism_attribute result_of disease_or_syndrome
|
||||
pharmacologic_substance treats injury_or_poisoning
|
||||
genetic_function affects biologic_function
|
||||
group exhibits individual_behavior
|
||||
human_caused_phenomenon_or_process result_of phenomenon_or_process
|
||||
antibiotic affects genetic_function
|
||||
hormone interacts_with enzyme
|
||||
pathologic_function process_of archaeon
|
||||
bird interacts_with mammal
|
||||
neuroreactive_substance_or_biogenic_amine disrupts organ_or_tissue_function
|
||||
carbohydrate causes cell_or_molecular_dysfunction
|
||||
cell_function affects disease_or_syndrome
|
||||
cell part_of fungus
|
||||
organism_function process_of human
|
||||
receptor complicates mental_or_behavioral_dysfunction
|
||||
genetic_function isa molecular_function
|
||||
mental_or_behavioral_dysfunction degree_of cell_or_molecular_dysfunction
|
||||
group_attribute property_of family_group
|
||||
pharmacologic_substance diagnoses experimental_model_of_disease
|
||||
pathologic_function affects alga
|
||||
tissue location_of biologic_function
|
||||
organism_function co-occurs_with mental_process
|
||||
occupational_activity associated_with neoplastic_process
|
||||
indicator_reagent_or_diagnostic_aid affects genetic_function
|
||||
carbohydrate interacts_with biomedical_or_dental_material
|
||||
organism_function occurs_in temporal_concept
|
||||
inorganic_chemical causes anatomical_abnormality
|
||||
cell_or_molecular_dysfunction affects organism_function
|
||||
amphibian exhibits social_behavior
|
||||
anatomical_structure part_of alga
|
||||
lipid isa entity
|
||||
cell_or_molecular_dysfunction result_of disease_or_syndrome
|
||||
social_behavior associated_with professional_or_occupational_group
|
||||
cell produces hormone
|
||||
invertebrate isa entity
|
||||
organic_chemical causes cell_or_molecular_dysfunction
|
||||
acquired_abnormality result_of human_caused_phenomenon_or_process
|
||||
pathologic_function manifestation_of disease_or_syndrome
|
||||
chemical_viewed_functionally issue_in occupation_or_discipline
|
||||
experimental_model_of_disease co-occurs_with anatomical_abnormality
|
||||
laboratory_procedure assesses_effect_of element_ion_or_isotope
|
||||
diagnostic_procedure measures cell_function
|
||||
chemical_viewed_structurally issue_in occupation_or_discipline
|
||||
genetic_function affects disease_or_syndrome
|
||||
laboratory_or_test_result co-occurs_with sign_or_symptom
|
||||
amino_acid_peptide_or_protein interacts_with chemical_viewed_functionally
|
||||
cell part_of bacterium
|
||||
cell_function affects clinical_attribute
|
||||
fully_formed_anatomical_structure part_of plant
|
||||
chemical_viewed_structurally interacts_with lipid
|
||||
molecular_biology_research_technique measures molecular_function
|
||||
fungus interacts_with organism
|
||||
enzyme interacts_with vitamin
|
||||
congenital_abnormality manifestation_of mental_or_behavioral_dysfunction
|
||||
therapeutic_or_preventive_procedure complicates pathologic_function
|
||||
chemical affects organ_or_tissue_function
|
||||
virus location_of hormone
|
||||
organ_or_tissue_function produces hormone
|
||||
alga location_of neuroreactive_substance_or_biogenic_amine
|
||||
laboratory_procedure affects organ_or_tissue_function
|
||||
pathologic_function process_of invertebrate
|
||||
manufactured_object causes cell_or_molecular_dysfunction
|
||||
neoplastic_process affects rickettsia_or_chlamydia
|
||||
cell_or_molecular_dysfunction result_of acquired_abnormality
|
||||
genetic_function affects plant
|
||||
alga isa physical_object
|
||||
family_group performs laboratory_procedure
|
||||
disease_or_syndrome degree_of cell_or_molecular_dysfunction
|
||||
reptile exhibits social_behavior
|
||||
therapeutic_or_preventive_procedure affects patient_or_disabled_group
|
||||
qualitative_concept evaluation_of individual_behavior
|
||||
population_group uses regulation_or_law
|
||||
antibiotic causes cell_or_molecular_dysfunction
|
||||
cell_or_molecular_dysfunction occurs_in mental_or_behavioral_dysfunction
|
||||
acquired_abnormality manifestation_of genetic_function
|
||||
bacterium isa entity
|
||||
experimental_model_of_disease occurs_in age_group
|
||||
immunologic_factor causes congenital_abnormality
|
||||
laboratory_procedure measures pharmacologic_substance
|
||||
disease_or_syndrome affects fish
|
||||
biologic_function result_of neoplastic_process
|
||||
therapeutic_or_preventive_procedure associated_with acquired_abnormality
|
||||
cell produces vitamin
|
||||
mental_process process_of vertebrate
|
||||
mental_process result_of neoplastic_process
|
||||
diagnostic_procedure diagnoses cell_or_molecular_dysfunction
|
||||
rickettsia_or_chlamydia location_of vitamin
|
||||
neoplastic_process manifestation_of pathologic_function
|
||||
disease_or_syndrome precedes neoplastic_process
|
||||
physiologic_function result_of natural_phenomenon_or_process
|
||||
laboratory_or_test_result measurement_of food
|
||||
diagnostic_procedure assesses_effect_of element_ion_or_isotope
|
||||
vitamin causes cell_or_molecular_dysfunction
|
||||
carbohydrate_sequence isa idea_or_concept
|
||||
human_caused_phenomenon_or_process result_of injury_or_poisoning
|
||||
element_ion_or_isotope causes cell_or_molecular_dysfunction
|
||||
organic_chemical causes congenital_abnormality
|
||||
human_caused_phenomenon_or_process result_of experimental_model_of_disease
|
||||
experimental_model_of_disease complicates mental_or_behavioral_dysfunction
|
||||
organ_or_tissue_function isa natural_phenomenon_or_process
|
||||
nucleotide_sequence isa molecular_sequence
|
||||
physiologic_function affects fungus
|
||||
experimental_model_of_disease isa phenomenon_or_process
|
||||
cell_or_molecular_dysfunction manifestation_of injury_or_poisoning
|
||||
clinical_drug causes acquired_abnormality
|
||||
cell_component location_of genetic_function
|
||||
occupational_activity associated_with disease_or_syndrome
|
||||
laboratory_or_test_result associated_with anatomical_abnormality
|
||||
age_group performs social_behavior
|
||||
fully_formed_anatomical_structure location_of physiologic_function
|
||||
hormone interacts_with vitamin
|
||||
molecular_function precedes organism_function
|
||||
human_caused_phenomenon_or_process isa event
|
||||
professional_or_occupational_group performs diagnostic_procedure
|
||||
disease_or_syndrome co-occurs_with injury_or_poisoning
|
||||
mental_process produces biologically_active_substance
|
||||
molecular_function produces hormone
|
||||
neoplastic_process complicates congenital_abnormality
|
||||
neoplastic_process result_of mental_process
|
||||
eicosanoid issue_in biomedical_occupation_or_discipline
|
||||
health_care_related_organization isa entity
|
||||
cell_function isa biologic_function
|
||||
diagnostic_procedure analyzes pharmacologic_substance
|
||||
immunologic_factor complicates genetic_function
|
||||
physiologic_function precedes genetic_function
|
||||
immunologic_factor complicates physiologic_function
|
||||
mental_or_behavioral_dysfunction process_of mammal
|
||||
immunologic_factor indicates neoplastic_process
|
||||
neoplastic_process process_of genetic_function
|
||||
biologic_function affects fish
|
||||
organ_or_tissue_function affects bird
|
||||
mental_or_behavioral_dysfunction result_of environmental_effect_of_humans
|
||||
hazardous_or_poisonous_substance complicates neoplastic_process
|
||||
cell_or_molecular_dysfunction result_of social_behavior
|
||||
experimental_model_of_disease result_of human_caused_phenomenon_or_process
|
||||
element_ion_or_isotope interacts_with vitamin
|
||||
drug_delivery_device causes injury_or_poisoning
|
||||
fully_formed_anatomical_structure location_of fungus
|
||||
fully_formed_anatomical_structure location_of bacterium
|
||||
natural_phenomenon_or_process result_of disease_or_syndrome
|
||||
enzyme complicates experimental_model_of_disease
|
||||
individual_behavior manifestation_of mental_or_behavioral_dysfunction
|
||||
geographic_area isa idea_or_concept
|
||||
tissue isa fully_formed_anatomical_structure
|
||||
sign_or_symptom diagnoses experimental_model_of_disease
|
||||
educational_activity associated_with pathologic_function
|
||||
receptor affects biologic_function
|
||||
organ_or_tissue_function co-occurs_with physiologic_function
|
||||
mental_or_behavioral_dysfunction produces vitamin
|
||||
experimental_model_of_disease result_of physiologic_function
|
||||
hormone complicates physiologic_function
|
||||
self_help_or_relief_organization carries_out educational_activity
|
||||
environmental_effect_of_humans isa event
|
||||
chemical causes disease_or_syndrome
|
||||
diagnostic_procedure diagnoses congenital_abnormality
|
||||
cell_component part_of human
|
||||
experimental_model_of_disease result_of health_care_activity
|
||||
laboratory_or_test_result manifestation_of experimental_model_of_disease
|
||||
organism_attribute measurement_of mental_process
|
||||
cell_function affects genetic_function
|
||||
anatomical_structure part_of plant
|
||||
natural_phenomenon_or_process result_of pathologic_function
|
||||
congenital_abnormality result_of experimental_model_of_disease
|
||||
organism_function produces receptor
|
||||
food causes neoplastic_process
|
||||
hormone affects genetic_function
|
||||
diagnostic_procedure issue_in biomedical_occupation_or_discipline
|
||||
organ_or_tissue_function process_of mental_or_behavioral_dysfunction
|
||||
bird interacts_with archaeon
|
||||
laboratory_procedure analyzes organophosphorus_compound
|
||||
animal interacts_with organism
|
||||
laboratory_procedure assesses_effect_of disease_or_syndrome
|
||||
plant interacts_with alga
|
||||
therapeutic_or_preventive_procedure prevents neoplastic_process
|
||||
congenital_abnormality complicates anatomical_abnormality
|
||||
antibiotic disrupts organism_function
|
||||
age_group performs daily_or_recreational_activity
|
||||
gene_or_genome part_of plant
|
||||
amino_acid_peptide_or_protein interacts_with neuroreactive_substance_or_biogenic_amine
|
||||
pharmacologic_substance causes pathologic_function
|
||||
lipid issue_in occupation_or_discipline
|
||||
research_device causes anatomical_abnormality
|
||||
disease_or_syndrome process_of alga
|
||||
anatomical_abnormality result_of cell_function
|
||||
antibiotic treats experimental_model_of_disease
|
||||
antibiotic complicates mental_process
|
||||
injury_or_poisoning result_of cell_function
|
||||
physiologic_function precedes organ_or_tissue_function
|
||||
genetic_function result_of human_caused_phenomenon_or_process
|
||||
quantitative_concept measurement_of mental_process
|
||||
fungus causes pathologic_function
|
||||
rickettsia_or_chlamydia location_of immunologic_factor
|
||||
eicosanoid interacts_with element_ion_or_isotope
|
||||
inorganic_chemical causes neoplastic_process
|
||||
anatomical_structure issue_in biomedical_occupation_or_discipline
|
||||
immunologic_factor complicates injury_or_poisoning
|
||||
drug_delivery_device treats injury_or_poisoning
|
||||
research_device isa entity
|
||||
biologically_active_substance interacts_with neuroreactive_substance_or_biogenic_amine
|
||||
organophosphorus_compound interacts_with biologically_active_substance
|
||||
molecular_function affects amphibian
|
||||
mental_or_behavioral_dysfunction co-occurs_with injury_or_poisoning
|
||||
neoplastic_process manifestation_of experimental_model_of_disease
|
||||
bacterium location_of biologically_active_substance
|
||||
organic_chemical interacts_with biomedical_or_dental_material
|
||||
physiologic_function affects natural_phenomenon_or_process
|
||||
laboratory_procedure isa health_care_activity
|
||||
neoplastic_process complicates anatomical_abnormality
|
||||
anatomical_abnormality affects vertebrate
|
||||
clinical_attribute manifestation_of organ_or_tissue_function
|
||||
embryonic_structure part_of fungus
|
||||
inorganic_chemical interacts_with enzyme
|
||||
mental_or_behavioral_dysfunction co-occurs_with experimental_model_of_disease
|
||||
enzyme complicates neoplastic_process
|
||||
sign_or_symptom manifestation_of organ_or_tissue_function
|
||||
organ_or_tissue_function co-occurs_with molecular_function
|
||||
age_group isa group
|
||||
steroid affects neoplastic_process
|
||||
age_group exhibits behavior
|
||||
disease_or_syndrome manifestation_of physiologic_function
|
||||
diagnostic_procedure isa event
|
||||
biologically_active_substance disrupts gene_or_genome
|
||||
anatomical_abnormality manifestation_of mental_process
|
||||
cell_function result_of physiologic_function
|
||||
mental_process process_of human
|
||||
chemical issue_in biomedical_occupation_or_discipline
|
||||
alga interacts_with human
|
||||
vitamin affects biologic_function
|
||||
fully_formed_anatomical_structure produces carbohydrate
|
||||
environmental_effect_of_humans result_of acquired_abnormality
|
||||
disease_or_syndrome result_of human_caused_phenomenon_or_process
|
||||
organic_chemical interacts_with steroid
|
||||
cell_or_molecular_dysfunction process_of natural_phenomenon_or_process
|
||||
anatomical_abnormality part_of animal
|
||||
diagnostic_procedure uses drug_delivery_device
|
||||
molecular_biology_research_technique method_of diagnostic_procedure
|
||||
biologically_active_substance causes injury_or_poisoning
|
||||
anatomical_abnormality affects plant
|
||||
molecular_function process_of invertebrate
|
||||
diagnostic_procedure measures pharmacologic_substance
|
||||
element_ion_or_isotope affects molecular_function
|
||||
mental_or_behavioral_dysfunction result_of neoplastic_process
|
||||
machine_activity isa activity
|
||||
nucleic_acid_nucleoside_or_nucleotide interacts_with hormone
|
||||
laboratory_procedure affects neoplastic_process
|
||||
biomedical_or_dental_material isa chemical
|
||||
pathologic_function affects animal
|
||||
receptor causes pathologic_function
|
||||
indicator_reagent_or_diagnostic_aid causes anatomical_abnormality
|
||||
neuroreactive_substance_or_biogenic_amine affects cell_or_molecular_dysfunction
|
||||
cell_function affects rickettsia_or_chlamydia
|
||||
embryonic_structure location_of virus
|
||||
therapeutic_or_preventive_procedure affects cell_function
|
||||
human interacts_with organism
|
||||
fungus causes disease_or_syndrome
|
||||
cell produces receptor
|
||||
population_group produces regulation_or_law
|
||||
family_group performs research_activity
|
||||
vitamin causes injury_or_poisoning
|
||||
molecular_sequence issue_in biomedical_occupation_or_discipline
|
||||
steroid issue_in biomedical_occupation_or_discipline
|
||||
bacterium interacts_with fish
|
||||
cell_function result_of mental_process
|
||||
organism_attribute property_of mammal
|
||||
anatomical_abnormality manifestation_of disease_or_syndrome
|
||||
cell_or_molecular_dysfunction result_of environmental_effect_of_humans
|
||||
physiologic_function affects mammal
|
||||
fully_formed_anatomical_structure part_of bird
|
||||
organic_chemical interacts_with hormone
|
||||
idea_or_concept issue_in occupation_or_discipline
|
||||
patient_or_disabled_group uses research_device
|
||||
receptor causes acquired_abnormality
|
||||
biologic_function result_of disease_or_syndrome
|
||||
biologically_active_substance interacts_with enzyme
|
||||
physiologic_function isa biologic_function
|
||||
antibiotic complicates cell_function
|
||||
hazardous_or_poisonous_substance disrupts mental_process
|
||||
pathologic_function precedes cell_or_molecular_dysfunction
|
||||
organism_attribute property_of organism
|
||||
organophosphorus_compound interacts_with carbohydrate
|
||||
bacterium location_of receptor
|
||||
organ_or_tissue_function result_of experimental_model_of_disease
|
||||
fully_formed_anatomical_structure location_of organism_function
|
||||
finding isa conceptual_entity
|
||||
congenital_abnormality isa entity
|
||||
tissue issue_in biomedical_occupation_or_discipline
|
||||
natural_phenomenon_or_process result_of neoplastic_process
|
||||
organism_attribute manifestation_of organ_or_tissue_function
|
||||
therapeutic_or_preventive_procedure complicates cell_function
|
||||
population_group produces medical_device
|
||||
antibiotic interacts_with biologically_active_substance
|
||||
antibiotic causes acquired_abnormality
|
||||
cell_function produces vitamin
|
||||
neoplastic_process affects physiologic_function
|
||||
environmental_effect_of_humans result_of mental_or_behavioral_dysfunction
|
||||
organ_or_tissue_function affects organism_function
|
||||
lipid affects pathologic_function
|
||||
laboratory_procedure affects mental_process
|
||||
biologically_active_substance disrupts cell_component
|
||||
finding manifestation_of organism_function
|
||||
organism_function affects bird
|
||||
genetic_function affects physiologic_function
|
||||
cell_function result_of genetic_function
|
||||
antibiotic affects physiologic_function
|
||||
organophosphorus_compound causes pathologic_function
|
||||
natural_phenomenon_or_process affects genetic_function
|
||||
neoplastic_process produces receptor
|
||||
laboratory_procedure measures biomedical_or_dental_material
|
||||
organism_attribute measurement_of molecular_function
|
||||
physiologic_function affects biologic_function
|
||||
experimental_model_of_disease result_of neoplastic_process
|
||||
alga interacts_with virus
|
||||
congenital_abnormality location_of fungus
|
||||
antibiotic diagnoses cell_or_molecular_dysfunction
|
||||
diagnostic_procedure measures temporal_concept
|
||||
mental_or_behavioral_dysfunction result_of biologic_function
|
||||
pharmacologic_substance complicates biologic_function
|
||||
pharmacologic_substance disrupts organism_function
|
||||
anatomical_abnormality result_of injury_or_poisoning
|
||||
fully_formed_anatomical_structure location_of molecular_function
|
||||
nucleic_acid_nucleoside_or_nucleotide interacts_with antibiotic
|
||||
neuroreactive_substance_or_biogenic_amine isa biologically_active_substance
|
||||
experimental_model_of_disease process_of bacterium
|
||||
neuroreactive_substance_or_biogenic_amine interacts_with chemical
|
||||
cell_or_molecular_dysfunction affects bird
|
||||
laboratory_or_test_result isa conceptual_entity
|
||||
pathologic_function associated_with organism_attribute
|
||||
acquired_abnormality co-occurs_with injury_or_poisoning
|
||||
professional_or_occupational_group uses drug_delivery_device
|
||||
professional_or_occupational_group diagnoses experimental_model_of_disease
|
||||
cell_or_molecular_dysfunction degree_of neoplastic_process
|
||||
neoplastic_process issue_in biomedical_occupation_or_discipline
|
||||
disease_or_syndrome result_of mental_process
|
||||
neoplastic_process process_of bird
|
||||
pathologic_function result_of anatomical_abnormality
|
||||
congenital_abnormality manifestation_of disease_or_syndrome
|
||||
organism_attribute result_of neoplastic_process
|
||||
injury_or_poisoning issue_in occupation_or_discipline
|
||||
receptor causes mental_or_behavioral_dysfunction
|
||||
clinical_attribute property_of bacterium
|
||||
nucleic_acid_nucleoside_or_nucleotide affects experimental_model_of_disease
|
||||
lipid causes congenital_abnormality
|
||||
chemical_viewed_structurally interacts_with chemical_viewed_functionally
|
||||
antibiotic prevents pathologic_function
|
||||
eicosanoid isa organic_chemical
|
||||
biologically_active_substance disrupts organ_or_tissue_function
|
||||
organ_or_tissue_function affects genetic_function
|
||||
antibiotic affects neoplastic_process
|
||||
fully_formed_anatomical_structure location_of virus
|
||||
qualitative_concept evaluation_of activity
|
||||
embryonic_structure part_of cell
|
||||
enzyme disrupts tissue
|
||||
governmental_or_regulatory_activity associated_with disease_or_syndrome
|
||||
gene_or_genome location_of mental_process
|
||||
neoplastic_process process_of organism_function
|
||||
pharmacologic_substance issue_in biomedical_occupation_or_discipline
|
||||
receptor complicates disease_or_syndrome
|
||||
disease_or_syndrome process_of genetic_function
|
||||
anatomical_abnormality location_of virus
|
||||
embryonic_structure part_of vertebrate
|
||||
organism_function affects experimental_model_of_disease
|
||||
manufactured_object causes mental_or_behavioral_dysfunction
|
||||
cell part_of body_part_organ_or_organ_component
|
||||
molecular_function result_of experimental_model_of_disease
|
||||
medical_device treats acquired_abnormality
|
||||
disease_or_syndrome affects human
|
||||
body_part_organ_or_organ_component location_of molecular_function
|
||||
disease_or_syndrome occurs_in neoplastic_process
|
||||
vitamin isa chemical_viewed_functionally
|
||||
cell_component issue_in occupation_or_discipline
|
||||
cell_component produces nucleic_acid_nucleoside_or_nucleotide
|
||||
bacterium isa organism
|
||||
cell_or_molecular_dysfunction occurs_in injury_or_poisoning
|
||||
hazardous_or_poisonous_substance issue_in occupation_or_discipline
|
||||
organization location_of educational_activity
|
||||
tissue produces biologically_active_substance
|
||||
fungus isa physical_object
|
||||
organism_function result_of phenomenon_or_process
|
||||
organism_function isa biologic_function
|
||||
organic_chemical interacts_with nucleic_acid_nucleoside_or_nucleotide
|
||||
organic_chemical affects natural_phenomenon_or_process
|
||||
diagnostic_procedure associated_with neoplastic_process
|
||||
molecular_function produces neuroreactive_substance_or_biogenic_amine
|
||||
mental_or_behavioral_dysfunction process_of bird
|
||||
chemical_viewed_structurally affects biologic_function
|
||||
experimental_model_of_disease produces biologically_active_substance
|
||||
mental_or_behavioral_dysfunction associated_with organism_attribute
|
||||
laboratory_procedure analyzes neuroreactive_substance_or_biogenic_amine
|
||||
organic_chemical isa substance
|
||||
mental_or_behavioral_dysfunction affects plant
|
||||
daily_or_recreational_activity associated_with experimental_model_of_disease
|
||||
mental_process precedes organ_or_tissue_function
|
||||
chemical affects neoplastic_process
|
||||
hormone ingredient_of clinical_drug
|
||||
hormone isa biologically_active_substance
|
||||
molecular_biology_research_technique measures receptor
|
||||
experimental_model_of_disease precedes neoplastic_process
|
||||
pharmacologic_substance interacts_with neuroreactive_substance_or_biogenic_amine
|
||||
virus interacts_with fish
|
||||
acquired_abnormality affects virus
|
||||
pathologic_function result_of molecular_function
|
||||
embryonic_structure location_of fungus
|
||||
pharmacologic_substance affects cell_or_molecular_dysfunction
|
||||
biologic_function result_of acquired_abnormality
|
||||
neoplastic_process co-occurs_with anatomical_abnormality
|
||||
neoplastic_process result_of acquired_abnormality
|
||||
body_part_organ_or_organ_component produces body_substance
|
||||
cell_or_molecular_dysfunction process_of mental_or_behavioral_dysfunction
|
||||
educational_activity issue_in occupation_or_discipline
|
||||
pathologic_function manifestation_of neoplastic_process
|
||||
virus causes pathologic_function
|
||||
injury_or_poisoning complicates experimental_model_of_disease
|
||||
eicosanoid interacts_with pharmacologic_substance
|
||||
molecular_function result_of natural_phenomenon_or_process
|
||||
neoplastic_process produces tissue
|
||||
diagnostic_procedure assesses_effect_of vitamin
|
||||
anatomical_abnormality manifestation_of organ_or_tissue_function
|
||||
hazardous_or_poisonous_substance ingredient_of clinical_drug
|
||||
organophosphorus_compound affects cell_or_molecular_dysfunction
|
||||
laboratory_procedure measures organism_attribute
|
||||
chemical_viewed_functionally interacts_with immunologic_factor
|
||||
diagnostic_procedure diagnoses disease_or_syndrome
|
||||
injury_or_poisoning complicates disease_or_syndrome
|
||||
molecular_function result_of congenital_abnormality
|
||||
biologic_function affects bacterium
|
||||
organism_function produces hormone
|
||||
individual_behavior associated_with neoplastic_process
|
||||
natural_phenomenon_or_process affects cell_function
|
||||
daily_or_recreational_activity associated_with pathologic_function
|
||||
fully_formed_anatomical_structure location_of rickettsia_or_chlamydia
|
||||
organ_or_tissue_function affects pathologic_function
|
||||
neoplastic_process affects amphibian
|
||||
acquired_abnormality occurs_in age_group
|
||||
mental_process affects organism_attribute
|
||||
molecular_biology_research_technique measures neoplastic_process
|
||||
disease_or_syndrome occurs_in patient_or_disabled_group
|
||||
mental_or_behavioral_dysfunction affects mammal
|
||||
environmental_effect_of_humans isa phenomenon_or_process
|
||||
cell_or_molecular_dysfunction precedes experimental_model_of_disease
|
||||
laboratory_or_test_result isa entity
|
||||
virus interacts_with archaeon
|
||||
indicator_reagent_or_diagnostic_aid causes mental_or_behavioral_dysfunction
|
||||
anatomical_structure part_of fungus
|
||||
cell_or_molecular_dysfunction process_of bird
|
|
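Note: test.tsv above holds one evaluation triple per line in surface form (head relation tail, e.g. 'steroid interacts_with eicosanoid'); since every UMLS entity and relation name is a single underscore-joined token, whitespace splitting recovers the three fields whether the separator is a tab or a space. A minimal reader sketch (the path is taken from the diff header):

def load_text_triples(path: str = "dataset/umls/test.tsv"):
    # Yield (head, relation, tail) string triples, skipping malformed lines.
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) == 3:
                yield tuple(parts)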
5216
dataset/umls/train.tsv
Normal file
File diff suppressed because it is too large
2
lit_models/__init__.py
Normal file
@ -0,0 +1,2 @@
from .transformer import *
from .base import *
97
lit_models/base.py
Normal file
@ -0,0 +1,97 @@
|
||||
import argparse
|
||||
import pytorch_lightning as pl
|
||||
import torch
|
||||
from typing import Dict, Any
|
||||
|
||||
|
||||
OPTIMIZER = "AdamW"
|
||||
LR = 5e-5
|
||||
LOSS = "cross_entropy"
|
||||
ONE_CYCLE_TOTAL_STEPS = 100
|
||||
|
||||
class Config(dict):
|
||||
def __getattr__(self, name):
|
||||
return self.get(name)
|
||||
|
||||
def __setattr__(self, name, val):
|
||||
self[name] = val
|
||||
|
||||
|
||||
class BaseLitModel(pl.LightningModule):
|
||||
"""
|
||||
Generic PyTorch-Lightning class that must be initialized with a PyTorch module.
|
||||
"""
|
||||
|
||||
def __init__(self, model, args: argparse.Namespace = None):
|
||||
super().__init__()
|
||||
self.model = model
|
||||
self.args = Config(vars(args)) if args is not None else {}
|
||||
|
||||
optimizer = self.args.get("optimizer", OPTIMIZER)
|
||||
self.optimizer_class = getattr(torch.optim, optimizer)
|
||||
self.lr = self.args.get("lr", LR)
|
||||
|
||||
|
||||
@staticmethod
|
||||
def add_to_argparse(parser):
|
||||
parser.add_argument("--optimizer", type=str, default=OPTIMIZER, help="optimizer class from torch.optim")
|
||||
parser.add_argument("--lr", type=float, default=LR)
|
||||
parser.add_argument("--weight_decay", type=float, default=0.01)
|
||||
return parser
|
||||
|
||||
def configure_optimizers(self):
|
||||
optimizer = self.optimizer_class(self.parameters(), lr=self.lr)
|
||||
if self.one_cycle_max_lr is None:
|
||||
return optimizer
|
||||
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=optimizer, max_lr=self.one_cycle_max_lr, total_steps=self.one_cycle_total_steps)
|
||||
return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_loss"}
|
||||
|
||||
def forward(self, x):
|
||||
return self.model(x)
|
||||
|
||||
def training_step(self, batch, batch_idx): # pylint: disable=unused-argument
|
||||
x, y = batch
|
||||
logits = self(x)
|
||||
loss = self.loss_fn(logits, y)
|
||||
self.log("train_loss", loss)
|
||||
self.train_acc(logits, y)
|
||||
self.log("train_acc", self.train_acc, on_step=False, on_epoch=True)
|
||||
return loss
|
||||
|
||||
def validation_step(self, batch, batch_idx): # pylint: disable=unused-argument
|
||||
x, y = batch
|
||||
logits = self(x)
|
||||
loss = self.loss_fn(logits, y)
|
||||
self.log("val_loss", loss, prog_bar=True)
|
||||
self.val_acc(logits, y)
|
||||
self.log("val_acc", self.val_acc, on_step=False, on_epoch=True, prog_bar=True)
|
||||
|
||||
def test_step(self, batch, batch_idx): # pylint: disable=unused-argument
|
||||
x, y = batch
|
||||
logits = self(x)
|
||||
self.test_acc(logits, y)
|
||||
self.log("test_acc", self.test_acc, on_step=False, on_epoch=True)
|
||||
|
||||
@property
|
||||
def num_training_steps(self) -> int:
|
||||
"""Total training steps inferred from datamodule and devices."""
|
||||
if isinstance(self.trainer.limit_train_batches, int) and self.trainer.limit_train_batches != 0:
|
||||
dataset_size = self.trainer.limit_train_batches
|
||||
elif isinstance(self.trainer.limit_train_batches, float):
|
||||
# limit_train_batches is a percentage of batches
|
||||
dataset_size = len(self.trainer.datamodule.train_dataloader())
|
||||
dataset_size = int(dataset_size * self.trainer.limit_train_batches)
|
||||
else:
|
||||
dataset_size = len(self.trainer.datamodule.train_dataloader())
|
||||
|
||||
num_devices = max(1, self.trainer.num_gpus, self.trainer.num_processes)
|
||||
if self.trainer.tpu_cores:
|
||||
num_devices = max(num_devices, self.trainer.tpu_cores)
|
||||
|
||||
effective_batch_size = self.trainer.accumulate_grad_batches * num_devices
|
||||
max_estimated_steps = (dataset_size // effective_batch_size) * self.trainer.max_epochs
|
||||
|
||||
if self.trainer.max_steps and self.trainer.max_steps < max_estimated_steps:
|
||||
return self.trainer.max_steps
|
||||
return max_estimated_steps
|
||||
|
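As a side note (not from the repository, values made up for illustration): the Config wrapper above is what lets self.args behave both as a dict and as an attribute namespace, with missing keys falling back to None instead of raising.

import argparse

args = argparse.Namespace(optimizer="AdamW", lr=1e-4)
cfg = Config(vars(args))
print(cfg.lr)                         # 1e-4 -- attribute access on a plain dict
print(cfg.weight_decay)               # None -- missing keys do not raise
print(cfg.get("optimizer", "SGD"))    # "AdamW" -- ordinary dict .get still works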
521
lit_models/transformer.py
Normal file
@ -0,0 +1,521 @@
from logging import debug
import random
import pytorch_lightning as pl
import torch
import pickle
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import json
# from transformers.utils.dummy_pt_objects import PrefixConstrainedLogitsProcessor

from .base import BaseLitModel
from transformers.optimization import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

from functools import partial
from .utils import rank_score, acc, LabelSmoothSoftmaxCEV1

from typing import Callable, Iterable, List


def pad_distance(pad_length, distance):
    pad = nn.ConstantPad2d(padding=(0, pad_length, 0, pad_length), value=float('-inf'))
    distance = pad(distance)
    return distance


def lmap(f: Callable, x: Iterable) -> List:
    """list(map(f, x))"""
    return list(map(f, x))


def multilabel_categorical_crossentropy(y_pred, y_true):
    y_pred = (1 - 2 * y_true) * y_pred
    y_pred_neg = y_pred - y_true * 1e12
    y_pred_pos = y_pred - (1 - y_true) * 1e12
    zeros = torch.zeros_like(y_pred[..., :1])
    y_pred_neg = torch.cat([y_pred_neg, zeros], dim=-1)
    y_pred_pos = torch.cat([y_pred_pos, zeros], dim=-1)
    neg_loss = torch.logsumexp(y_pred_neg, dim=-1)
    pos_loss = torch.logsumexp(y_pred_pos, dim=-1)
    return (neg_loss + pos_loss).mean()


def decode(output_ids, tokenizer):
    return lmap(str.strip, tokenizer.batch_decode(output_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True))
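For intuition, a minimal sketch (toy tensors, not from the repository) of multilabel_categorical_crossentropy above: positive candidates are pushed above zero and negatives below zero, so the loss shrinks as that margin grows.

import torch

y_true = torch.tensor([[1., 0., 1., 0.]])        # two gold entities out of four candidates
y_pred = torch.tensor([[2.0, -1.0, 0.5, -3.0]])  # raw, unnormalised scores
loss = multilabel_categorical_crossentropy(y_pred, y_true)
print(loss)  # scalar; decreases as positive scores rise and negative scores fall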
class TransformerLitModel(BaseLitModel):
    def __init__(self, model, args, tokenizer=None, data_config={}):
        super().__init__(model, args)
        self.save_hyperparameters(args)
        if args.bce:
            self.loss_fn = torch.nn.BCEWithLogitsLoss()
        elif args.label_smoothing != 0.0:
            self.loss_fn = LabelSmoothSoftmaxCEV1(lb_smooth=args.label_smoothing)
        else:
            self.loss_fn = nn.CrossEntropyLoss()
        self.best_acc = 0
        self.first = True

        self.tokenizer = tokenizer
        self.num_heads = 12
        self.__dict__.update(data_config)
        # resize the word embedding layer
        self.model.resize_token_embeddings(len(self.tokenizer))
        self.decode = partial(decode, tokenizer=self.tokenizer)
        if args.pretrain:
            self._freaze_attention()
        elif "ind" in args.data_dir:
            # for the inductive setting, freeze the word embeddings
            self._freaze_word_embedding()

        self.spatial_pos_encoder = nn.Embedding(5, self.num_heads, padding_idx=0)
        self.graph_token_virtual_distance = nn.Embedding(1, self.num_heads)

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):  # pylint: disable=unused-argument
        # embed();exit()
        # print(self.optimizers().param_groups[1]['lr'])
        labels = batch.pop("labels")
        label = batch.pop("label")
        pos = batch.pop("pos")
        try:
            en = batch.pop("en")
            rel = batch.pop("rel")
        except KeyError:
            pass
        input_ids = batch['input_ids']

        distance_attention = torch.stack([pad_distance(len(input_ids[i]) - len(distance) - 1, distance) for i, distance in enumerate(batch['distance_attention'])])
        distance = batch.pop("distance_attention")
        graph_attn_bias = torch.zeros(input_ids.size(0), input_ids.size(1), input_ids.size(1), device='cuda')
        graph_attn_bias[:, 1:, 1:][distance_attention == float('-inf')] = float('-inf')
        graph_attn_bias = graph_attn_bias.unsqueeze(1).repeat(1, self.num_heads, 1, 1)
        distance_attention = self.spatial_pos_encoder(distance_attention.long()).permute(0, 3, 1, 2)
        graph_attn_bias[:, :, 1:, 1:] = graph_attn_bias[:, :, 1:, 1:] + distance_attention

        if self.args.use_global_node:
            t = self.graph_token_virtual_distance.weight.view(1, self.num_heads, 1)
            graph_attn_bias[:, :, 1:, 0] = graph_attn_bias[:, :, 1:, 0] + t
            graph_attn_bias[:, :, 0, :] = graph_attn_bias[:, :, 0, :] + t

        if self.args.add_attn_bias:
            logits = self.model(**batch, return_dict=True, distance_attention=graph_attn_bias).logits
        else:
            logits = self.model(**batch, return_dict=True, distance_attention=None).logits

        _, mask_idx = (input_ids == self.tokenizer.mask_token_id).nonzero(as_tuple=True)
        bs = input_ids.shape[0]
        mask_logits = logits[torch.arange(bs), mask_idx][:, self.entity_id_st:self.entity_id_ed]

        assert mask_idx.shape[0] == bs, "only one mask in sequence!"
        if self.args.bce:
            loss = self.loss_fn(mask_logits, labels)
        else:
            loss = self.loss_fn(mask_logits, label)

        if batch_idx == 0:
            print('\n'.join(self.decode(batch['input_ids'][:4])))

        return loss
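A rough sketch (toy sizes, not from the repository) of the two ingredients combined into graph_attn_bias above: pad_distance marks node pairs that are unreachable in the sampled subgraph with -inf, and spatial_pos_encoder maps each finite distance to a learned per-head bias.

import torch
import torch.nn as nn

d = torch.tensor([[0., 1., 2.], [1., 0., 1.], [2., 1., 0.]])    # pairwise node distances
padded = pad_distance(2, d)              # (5, 5); padded cells are -inf, i.e. blocked attention
mask = padded == float('-inf')

spatial_pos_encoder = nn.Embedding(5, 12, padding_idx=0)        # 12 attention heads
bias = spatial_pos_encoder(padded.masked_fill(mask, 0).long())  # (5, 5, 12): a bias per head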
    def _eval(self, batch, batch_idx):
        labels = batch.pop("labels")
        input_ids = batch['input_ids']
        # single label
        label = batch.pop('label')
        pos = batch.pop('pos')
        distance_attention = torch.stack([pad_distance(len(input_ids[i]) - len(distance) - 1, distance) for i, distance in enumerate(batch['distance_attention'])])
        distance = batch.pop("distance_attention")
        graph_attn_bias = torch.zeros(input_ids.size(0), input_ids.size(1), input_ids.size(1), device='cuda')
        graph_attn_bias[:, 1:, 1:][distance_attention == float('-inf')] = float('-inf')
        graph_attn_bias = graph_attn_bias.unsqueeze(1).repeat(1, self.num_heads, 1, 1)
        distance_attention = self.spatial_pos_encoder(distance_attention.long()).permute(0, 3, 1, 2)
        graph_attn_bias[:, :, 1:, 1:] = graph_attn_bias[:, :, 1:, 1:] + distance_attention
        # distance_attention = torch.stack([pad_distance(len(input_ids[i]) - len(distance), distance) for i, distance in enumerate(batch['distance_attention'])])
        # distance = batch.pop("distance_attention")
        # distance_attention = self.spatial_pos_encoder(distance_attention.long()).permute(0, 3, 1, 2)
        my_keys = list(batch.keys())
        for k in my_keys:
            if k not in ["input_ids", "attention_mask", "token_type_ids"]:
                batch.pop(k)

        if self.args.add_attn_bias:
            logits = self.model(**batch, return_dict=True, distance_attention=graph_attn_bias).logits[:, :, self.entity_id_st:self.entity_id_ed]
        else:
            logits = self.model(**batch, return_dict=True, distance_attention=None).logits[:, :, self.entity_id_st:self.entity_id_ed]
        _, mask_idx = (input_ids == self.tokenizer.mask_token_id).nonzero(as_tuple=True)
        bsz = input_ids.shape[0]
        logits = logits[torch.arange(bsz), mask_idx]
        # get the entity ranks
        # filter the entities
        assert labels[0][label[0]], "the correct id must be in the filter!"
        labels[torch.arange(bsz), label] = 0
        assert logits.shape == labels.shape
        logits += labels * -100  # mask entities that appear in the filter
        # for i in range(bsz):
        #     logits[i][labels]

        _, outputs = torch.sort(logits, dim=1, descending=True)
        _, outputs = torch.sort(outputs, dim=1)
        ranks = outputs[torch.arange(bsz), label].detach().cpu() + 1

        return dict(ranks=np.array(ranks))
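The double torch.sort in _eval converts filtered scores into ranks: the first sort orders candidates by score, the second recovers each candidate's position in that ordering. A toy sketch (numbers made up, not from the repository):

import torch

scores = torch.tensor([[0.1, 0.9, 0.4, 0.7]])           # one query, four candidate entities
_, order = torch.sort(scores, dim=1, descending=True)   # candidates by score: [1, 3, 2, 0]
_, rank = torch.sort(order, dim=1)                      # rank of each candidate: [3, 0, 2, 1]
gold = torch.tensor([2])
print(rank[torch.arange(1), gold] + 1)                  # tensor([3]): the gold entity is ranked 3rd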
    def validation_step(self, batch, batch_idx):
        result = self._eval(batch, batch_idx)
        return result

    def validation_epoch_end(self, outputs) -> None:
        ranks = np.concatenate([_['ranks'] for _ in outputs])
        total_ranks = ranks.shape[0]

        if not self.args.pretrain:
            l_ranks = ranks[np.array(list(np.arange(0, total_ranks, 2)))]
            r_ranks = ranks[np.array(list(np.arange(0, total_ranks, 2))) + 1]
            self.log("Eval/lhits10", (l_ranks <= 10).mean())
            self.log("Eval/rhits10", (r_ranks <= 10).mean())

        hits20 = (ranks <= 20).mean()
        hits10 = (ranks <= 10).mean()
        hits3 = (ranks <= 3).mean()
        hits1 = (ranks <= 1).mean()

        self.log("Eval/hits10", hits10)
        self.log("Eval/hits20", hits20)
        self.log("Eval/hits3", hits3)
        self.log("Eval/hits1", hits1)
        self.log("Eval/mean_rank", ranks.mean())
        self.log("Eval/mrr", (1. / ranks).mean())
        self.log("hits10", hits10, prog_bar=True)
        self.log("hits1", hits1, prog_bar=True)

    def test_step(self, batch, batch_idx):  # pylint: disable=unused-argument
        # ranks = self._eval(batch, batch_idx)
        result = self._eval(batch, batch_idx)
        # self.log("Test/ranks", np.mean(ranks))

        return result

    def test_epoch_end(self, outputs) -> None:
        ranks = np.concatenate([_['ranks'] for _ in outputs])

        hits20 = (ranks <= 20).mean()
        hits10 = (ranks <= 10).mean()
        hits3 = (ranks <= 3).mean()
        hits1 = (ranks <= 1).mean()

        self.log("Test/hits10", hits10)
        self.log("Test/hits20", hits20)
        self.log("Test/hits3", hits3)
        self.log("Test/hits1", hits1)
        self.log("Test/mean_rank", ranks.mean())
        self.log("Test/mrr", (1. / ranks).mean())

    def configure_optimizers(self):
        no_decay_param = ["bias", "LayerNorm.weight"]

        optimizer_group_parameters = [
            {"params": [p for n, p in self.model.named_parameters() if p.requires_grad and not any(nd in n for nd in no_decay_param)], "weight_decay": self.args.weight_decay},
            {"params": [p for n, p in self.model.named_parameters() if p.requires_grad and any(nd in n for nd in no_decay_param)], "weight_decay": 0}
        ]

        optimizer = self.optimizer_class(optimizer_group_parameters, lr=self.lr, eps=1e-8)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.num_training_steps * self.args.warm_up_radio, num_training_steps=self.num_training_steps)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                'scheduler': scheduler,
                'interval': 'step',  # or 'epoch'
                'frequency': 1,
            }
        }

    def _freaze_attention(self):
        for k, v in self.model.named_parameters():
            if "word" not in k:
                v.requires_grad = False
            else:
                print(k)

    def _freaze_word_embedding(self):
        for k, v in self.model.named_parameters():
            if "word" in k:
                print(k)
                v.requires_grad = False

    @staticmethod
    def add_to_argparse(parser):
        parser = BaseLitModel.add_to_argparse(parser)

        parser.add_argument("--label_smoothing", type=float, default=0.1, help="")
        parser.add_argument("--bce", type=int, default=0, help="")
        return parser
import faiss
import os

class GetEntityEmbeddingLitModel(TransformerLitModel):
    def __init__(self, model, args, tokenizer, data_config={}):
        super().__init__(model, args, tokenizer, data_config)

        self.faissid2entityid = {}
        # self.index = faiss.IndexFlatL2(d)  # build the index

        d, measure = self.model.config.hidden_size, faiss.METRIC_L2
        # param = 'HNSW64'
        # self.index = faiss.index_factory(d, param, measure)
        self.index = faiss.IndexFlatL2(d)  # build the index
        # print(self.index.is_trained)  # prints True at this point
        # index.add(xb)
        self.cnt_batch = 0
        self.total_embedding = []

    def test_step(self, batch, batch_idx):
        labels = batch.pop("labels")
        mask_idx = batch.pop("pos")
        input_ids = batch['input_ids']
        # single label
        label = batch.pop('label')
        # last layer
        hidden_states = self.model(**batch, return_dict=True, output_hidden_states=True).hidden_states[-1]
        # _, mask_idx = (input_ids == self.tokenizer.mask_token_id).nonzero(as_tuple=True)
        bsz = input_ids.shape[0]
        entity_embedding = hidden_states[torch.arange(bsz), mask_idx].cpu()
        # use normalize or not ?
        # entity_embedding = F.normalize(entity_embedding, dim=-1, p=2)
        self.total_embedding.append(entity_embedding)
        # self.index.add(np.array(entity_embedding, dtype=np.float32))
        for i, l in zip(range(bsz), label):
            self.faissid2entityid[i + self.cnt_batch] = l.cpu()
        self.cnt_batch += bsz

    def test_epoch_end(self, outputs) -> None:
        self.total_embedding = np.concatenate(self.total_embedding, axis=0)
        # self.index.train(self.total_embedding)
        print(faiss.MatrixStats(self.total_embedding).comments)
        self.index.add(self.total_embedding)
        faiss.write_index(self.index, os.path.join(self.args.data_dir, "faiss_dump.index"))
        with open(os.path.join(self.args.data_dir, "faissid2entityid.pkl"), 'wb') as file:
            pickle.dump(self.faissid2entityid, file)

        with open(os.path.join(self.args.data_dir, "total_embedding.pkl"), 'wb') as file:
            pickle.dump(self.total_embedding, file)
        # print(f"number of entity embeddings: {len(self.faissid2entityid)}")

    @staticmethod
    def add_to_argparse(parser):
        parser = TransformerLitModel.add_to_argparse(parser)
        parser.add_argument("--faiss_init", type=int, default=1, help="get the embeddings and save them to file.")
        return parser
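A self-contained sketch (toy vectors, not from the repository) of the faiss round trip that GetEntityEmbeddingLitModel and UseEntityEmbeddingLitModel split between them: dump an IndexFlatL2 of [MASK] embeddings, then read it back and query the top-k nearest stored vectors.

import faiss
import numpy as np

d = 4                                           # stands in for the model hidden size
xb = np.random.rand(10, d).astype(np.float32)   # stands in for the entity embeddings
index = faiss.IndexFlatL2(d)
index.add(xb)
faiss.write_index(index, "faiss_dump.index")

index = faiss.read_index("faiss_dump.index")
D, I = index.search(xb[:2], 3)                  # 3 nearest stored embeddings per query
print(D.shape, I.shape)                         # (2, 3) distances and faiss ids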
class UseEntityEmbeddingLitModel(TransformerLitModel):
    def __init__(self, model, args, tokenizer, data_config={}):
        super().__init__(model, args, tokenizer, data_config)

        self.faissid2entityid = pickle.load(open(os.path.join(self.args.data_dir, "faissid2entityid.pkl"), 'rb'))
        self.index = faiss.read_index(os.path.join(self.args.data_dir, "faiss_dump.index"))

        self.dis2logits = distance2logits_2

    def _eval(self, batch, batch_idx):
        labels = batch.pop("labels")
        pos = batch.pop("pos")
        input_ids = batch['input_ids']
        # single label
        label = batch.pop('label')

        hidden_states = self.model(**batch, return_dict=True, output_hidden_states=True).hidden_states[-1]
        _, mask_idx = (input_ids == self.tokenizer.mask_token_id).nonzero(as_tuple=True)
        bsz = input_ids.shape[0]
        mask_embedding = np.array(hidden_states[torch.arange(bsz), mask_idx].cpu(), dtype=np.float32)
        topk = 200
        D, I = self.index.search(mask_embedding, topk)
        labels[torch.arange(bsz), label] = 0

        entity_logits = torch.full(labels.shape, -100.).to(self.device)
        D = self.dis2logits(D)
        for i in range(bsz):
            for j in range(topk):
                # filter entities that appear in labels
                if I[i][j] not in self.faissid2entityid:
                    print(I[i][j])
                    break
                # assert I[i][j] in self.faissid2entityid, print(I[i][j])
                if labels[i][self.faissid2entityid[I[i][j]]]: continue
                if entity_logits[i][self.faissid2entityid[I[i][j]]] == -100.:
                    entity_logits[i][self.faissid2entityid[I[i][j]]] = D[i][j]
                # not added together
                # else:
                #     entity_logits[i][self.faissid2entityid[I[i][j]]] += D[i][j]
        # get the entity ranks
        # filter the entities

        assert entity_logits.shape == labels.shape

        _, outputs = torch.sort(entity_logits, dim=1, descending=True)
        _, outputs = torch.sort(outputs, dim=1)
        ranks = outputs[torch.arange(bsz), label].detach().cpu() + 1

        return dict(ranks=np.array(ranks))

    @staticmethod
    def add_to_argparse(parser):
        parser = TransformerLitModel.add_to_argparse(parser)
        parser.add_argument("--faiss_init", type=int, default=0, help="get the embeddings and save them to file.")
        parser.add_argument("--faiss_use", type=int, default=1, help="use the dumped faiss index for evaluation.")
        return parser


class CombineEntityEmbeddingLitModel(UseEntityEmbeddingLitModel):
    def __init__(self, model, args, tokenizer, data_config={}):
        super().__init__(model, args, tokenizer, data_config=data_config)
        self.dis2logits = distance2logits_2
        self.id2entity = {}
        with open("./dataset/FB15k-237/entity2textlong.txt", 'r') as file:
            cnt = 0
            for line in file.readlines():
                e, d = line.strip().split("\t")
                self.id2entity[cnt] = e
                cnt += 1
        self.id2entity_t = {}
        with open("./dataset/FB15k-237/entity2text.txt", 'r') as file:
            for line in file.readlines():
                e, d = line.strip().split("\t")
                self.id2entity_t[e] = d
        for k, v in self.id2entity.items():
            self.id2entity[k] = self.id2entity_t[v]

    def _eval(self, batch, batch_idx):
        labels = batch.pop("labels")
        input_ids = batch['input_ids']
        # single label
        label = batch.pop('label')
        pos = batch.pop("pos")

        result = self.model(**batch, return_dict=True, output_hidden_states=True)
        hidden_states = result.hidden_states[-1]
        _, mask_idx = (input_ids == self.tokenizer.mask_token_id).nonzero(as_tuple=True)
        bsz = input_ids.shape[0]
        mask_embedding = np.array(hidden_states[torch.arange(bsz), mask_idx].cpu(), dtype=np.float32)
        topk = self.args.knn_topk
        D, I = self.index.search(mask_embedding, topk)
        D = torch.from_numpy(D).to(self.device)
        assert labels[0][label[0]], "the correct id must be in the filter!"
        labels[torch.arange(bsz), label] = 0

        mask_logits = result.logits[:, :, self.entity_id_st:self.entity_id_ed]
        mask_logits = mask_logits[torch.arange(bsz), mask_idx]
        entity_logits = torch.full(labels.shape, 1000.).to(self.device)
        # D = self.dis2logits(D)
        for i in range(bsz):
            for j in range(topk):
                # filter entities that appear in labels
                if labels[i][self.faissid2entityid[I[i][j]]]: continue
                if entity_logits[i][self.faissid2entityid[I[i][j]]] == 1000.:
                    entity_logits[i][self.faissid2entityid[I[i][j]]] = D[i][j]
                # else:
                #     entity_logits[i][self.faissid2entityid[I[i][j]]] += D[i][j]
        entity_logits = self.dis2logits(entity_logits)
        # get the entity ranks
        # filter the entities
        assert entity_logits.shape == labels.shape
        assert mask_logits.shape == labels.shape
        # entity_logits = torch.softmax(entity_logits + labels * -100, dim=-1)  # mask filtered entities
        entity_logits = entity_logits + labels * -100.
        mask_logits = torch.softmax(mask_logits + labels * -100, dim=-1)
        # logits = mask_logits
        logits = combine_knn_and_vocab_probs(entity_logits, mask_logits, self.args.knn_lambda)
        # logits = entity_logits + mask_logits

        knn_topk_logits, knn_topk_id = entity_logits.topk(20)
        mask_topk_logits, mask_topk_id = mask_logits.topk(20)
        union_topk = []
        for i in range(bsz):
            num_same = len(list(set(knn_topk_id[i].cpu().tolist()) & set(mask_topk_id[i].cpu().tolist())))
            union_topk.append(num_same / 20.)

        knn_topk_id = knn_topk_id.to("cpu")
        mask_topk_id = mask_topk_id.to("cpu")
        mask_topk_logits = mask_topk_logits.to("cpu")
        knn_topk_logits = knn_topk_logits.to("cpu")
        label = label.to("cpu")

        for t in range(bsz):
            if knn_topk_id[t][0] == label[t] and knn_topk_logits[t][0] > mask_topk_logits[t][0] and mask_topk_logits[t][0] <= 0.4:
                print(knn_topk_logits[t], knn_topk_id[t])
                print(lmap(lambda x: self.id2entity[x.item()], knn_topk_id[t]))
                print(mask_topk_logits[t], mask_topk_id[t])
                print(lmap(lambda x: self.id2entity[x.item()], mask_topk_id[t]))
                print(label[t])
                print()

        _, outputs = torch.sort(logits, dim=1, descending=True)
        _, outputs = torch.sort(outputs, dim=1)
        ranks = outputs[torch.arange(bsz), label].detach().cpu() + 1

        return dict(ranks=np.array(ranks), knn_topk_id=knn_topk_id, knn_topk_logits=knn_topk_logits,
                    mask_topk_id=mask_topk_id, mask_topk_logits=mask_topk_logits, num_same=np.array(union_topk))

    def test_epoch_end(self, outputs) -> None:
        ranks = np.concatenate([_['ranks'] for _ in outputs])
        num_same = np.concatenate([_['num_same'] for _ in outputs])
        results_keys = list(outputs[0].keys())
        results = {}
        # for k in results_keys:
        #     results.

        self.log("Test/num_same", num_same.mean())

        hits20 = (ranks <= 20).mean()
        hits10 = (ranks <= 10).mean()
        hits3 = (ranks <= 3).mean()
        hits1 = (ranks <= 1).mean()

        self.log("Test/hits10", hits10)
        self.log("Test/hits20", hits20)
        self.log("Test/hits3", hits3)
        self.log("Test/hits1", hits1)
        self.log("Test/mean_rank", ranks.mean())
        self.log("Test/mrr", (1. / ranks).mean())

    @staticmethod
    def add_to_argparse(parser):
        parser = TransformerLitModel.add_to_argparse(parser)
        parser.add_argument("--knn_lambda", type=float, default=0.5, help="lambda * knn + (1-lambda) * mask logits, lambda of knn logits and mask logits.")
        parser.add_argument("--knn_topk", type=int, default=100, help="")

        return parser

def combine_knn_and_vocab_probs(knn_p, vocab_p, coeff=0.5):
    combine_probs = torch.stack([vocab_p, knn_p], dim=0)
    coeffs = torch.ones_like(combine_probs)
    coeffs[0] = np.log(1 - coeff)
    coeffs[1] = np.log(coeff)
    curr_prob = torch.logsumexp(combine_probs + coeffs, dim=0)

    return curr_prob

def distance2logits(D):
    return torch.softmax(-1. * torch.tensor(D) / 30., dim=-1)

def distance2logits_2(D, n=10):
    if not isinstance(D, torch.Tensor):
        D = torch.tensor(D)
    # fall back to zeros so a caller never receives None when all distances are zero
    distances = torch.zeros_like(D)
    if torch.sum(D) != 0.0:
        distances = torch.exp(-D / n) / torch.sum(torch.exp(-D / n), dim=-1, keepdim=True)
    return distances
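For intuition, a small sketch (toy numbers, not from the repository) of combine_knn_and_vocab_probs: when its two inputs are log-probabilities, as in the kNN-LM formulation it follows, the output is the log of the coeff-weighted mixture.

import torch

vocab_logp = torch.log(torch.tensor([0.7, 0.2, 0.1]))  # log-probs from the MLM head
knn_logp = torch.log(torch.tensor([0.1, 0.8, 0.1]))    # log-probs derived from faiss distances
mixed = combine_knn_and_vocab_probs(knn_logp, vocab_logp, coeff=0.5)
print(torch.exp(mixed))  # tensor([0.4000, 0.5000, 0.1000]) = 0.5 * knn + 0.5 * vocab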
66
lit_models/utils.py
Normal file
@ -0,0 +1,66 @@
import json
import numpy as np

def rank_score(ranks):
    # prepare the dataset
    len_samples = len(ranks)
    hits10 = [0] * len_samples
    hits5 = [0] * len_samples
    hits1 = [0] * len_samples
    mrr = []

    for idx, rank in enumerate(ranks):
        if rank <= 10:
            hits10[idx] = 1.
        if rank <= 5:
            hits5[idx] = 1.
        if rank <= 1:
            hits1[idx] = 1.
        mrr.append(1. / rank)

    return np.mean(hits10), np.mean(hits5), np.mean(hits1), np.mean(mrr)

def acc(logits, labels):
    preds = np.argmax(logits, axis=-1)
    return (preds == labels).mean()

import torch.nn as nn
import torch

class LabelSmoothSoftmaxCEV1(nn.Module):
    '''
    This is the autograd version; you can also try LabelSmoothSoftmaxCEV2, which uses derived gradients.
    '''

    def __init__(self, lb_smooth=0.1, reduction='mean', ignore_index=-100):
        super(LabelSmoothSoftmaxCEV1, self).__init__()
        self.lb_smooth = lb_smooth
        self.reduction = reduction
        self.lb_ignore = ignore_index
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, logits, label):
        '''
        args: logits: tensor of shape (N, C, H, W)
        args: label: tensor of shape (N, H, W)
        '''
        # overcome ignored label
        with torch.no_grad():
            num_classes = logits.size(1)
            label = label.clone().detach()
            ignore = label == self.lb_ignore
            n_valid = (ignore == 0).sum()
            label[ignore] = 0
            lb_pos, lb_neg = 1. - self.lb_smooth, self.lb_smooth / num_classes
            label = torch.empty_like(logits).fill_(
                lb_neg).scatter_(1, label.unsqueeze(1), lb_pos).detach()

        logs = self.log_softmax(logits)
        loss = -torch.sum(logs * label, dim=1)
        loss[ignore] = 0
        if self.reduction == 'mean':
            loss = loss.sum() / n_valid
        if self.reduction == 'sum':
            loss = loss.sum()

        return loss
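A quick usage sketch (toy shapes, not from the repository) of LabelSmoothSoftmaxCEV1 in the 2-D case TransformerLitModel uses it for, i.e. logits of shape (N, C) and integer targets of shape (N,):

import torch

criterion = LabelSmoothSoftmaxCEV1(lb_smooth=0.1)
logits = torch.randn(4, 10)            # 4 examples, 10 candidate entities
target = torch.tensor([1, 3, 3, 0])    # gold entity index per example
loss = criterion(logits, target)
print(loss)                             # scalar, averaged over the 4 valid examples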
139
main.py
Normal file
@ -0,0 +1,139 @@
import argparse
import importlib
from logging import debug
import numpy as np
import torch
import pytorch_lightning as pl
import lit_models
import yaml
import time
from transformers import AutoConfig
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

# In order to ensure reproducible experiments, we must set random seeds.

def _import_class(module_and_class_name: str) -> type:
    """Import class from a module, e.g. 'text_recognizer.models.MLP'"""
    module_name, class_name = module_and_class_name.rsplit(".", 1)
    module = importlib.import_module(module_name)
    class_ = getattr(module, class_name)
    return class_


def _setup_parser():
    """Set up Python's ArgumentParser with data, model, trainer, and other arguments."""
    parser = argparse.ArgumentParser(add_help=False)

    # Add Trainer specific arguments, such as --max_epochs, --gpus, --precision
    trainer_parser = pl.Trainer.add_argparse_args(parser)
    trainer_parser._action_groups[1].title = "Trainer Args"  # pylint: disable=protected-access
    parser = argparse.ArgumentParser(add_help=False, parents=[trainer_parser])

    # Basic arguments
    parser.add_argument("--max_triplet", type=int, default=32)
    parser.add_argument("--use_global_node", type=bool, default=False)
    parser.add_argument("--add_attn_bias", type=bool, default=True)
    # parser.add_argument("--wandb", action="store_true", default=False)
    parser.add_argument("--litmodel_class", type=str, default="TransformerLitModel")
    parser.add_argument("--seed", type=int, default=5)
    parser.add_argument("--data_class", type=str, default="KGC")
    parser.add_argument("--chunk", type=str, default="")
    parser.add_argument("--model_class", type=str, default="RobertaUseLabelWord")
    parser.add_argument("--checkpoint", type=str, default=None)

    # Get the data and model classes, so that we can add their specific arguments
    temp_args, _ = parser.parse_known_args()
    data_class = _import_class(f"data.{temp_args.data_class}")
    model_class = _import_class(f"models.{temp_args.model_class}")
    lit_model_class = _import_class(f"lit_models.{temp_args.litmodel_class}")

    print('***********************')
    print(f"data.{temp_args.data_class}")
    print(f"models.{temp_args.model_class}")
    print(f"lit_models.{temp_args.litmodel_class}")
    print('***********************')

    # Get data, model, and LitModel specific arguments
    data_group = parser.add_argument_group("Data Args")
    data_class.add_to_argparse(data_group)

    model_group = parser.add_argument_group("Model Args")
    if hasattr(model_class, "add_to_argparse"):
        model_class.add_to_argparse(model_group)

    lit_model_group = parser.add_argument_group("LitModel Args")
    lit_model_class.add_to_argparse(lit_model_group)

    parser.add_argument("--help", "-h", action="help")
    return parser

def _saved_pretrain(lit_model, tokenizer, path):
    lit_model.model.save_pretrained(path)
    tokenizer.save_pretrained(path)


def main():
    parser = _setup_parser()
    args = parser.parse_args()

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    pl.seed_everything(args.seed)

    data_class = _import_class(f"data.{args.data_class}")
    model_class = _import_class(f"models.{args.model_class}")
    litmodel_class = _import_class(f"lit_models.{args.litmodel_class}")

    config = AutoConfig.from_pretrained(args.model_name_or_path)
    # update parameters
    config.label_smoothing = args.label_smoothing

    model = model_class.from_pretrained(args.model_name_or_path, config=config)
    data = data_class(args, model)
    # print('data', data)
    tokenizer = data.tokenizer

    lit_model = litmodel_class(args=args, model=model, tokenizer=tokenizer, data_config=data.get_config())
    if args.checkpoint:
        lit_model.load_state_dict(torch.load(args.checkpoint, map_location="cpu")["state_dict"], strict=False)

    logger = pl.loggers.TensorBoardLogger("training/logs")
    # if args.wandb:
    #     logger = pl.loggers.WandbLogger(project="kgc_bert", name=args.data_dir.split("/")[-1])
    #     logger.log_hyperparams(vars(args))

    metric_name = "Eval/mrr" if not args.pretrain else "Eval/hits1"

    early_callback = pl.callbacks.EarlyStopping(monitor="Eval/mrr", mode="max", patience=10)
    model_checkpoint = pl.callbacks.ModelCheckpoint(monitor=metric_name, mode="max",
        filename=args.data_dir.split("/")[-1] + '/{epoch}-{Eval/hits10:.2f}-{Eval/hits1:.2f}' if not args.pretrain else args.data_dir.split("/")[-1] + '/{epoch}-{step}-{Eval/hits10:.2f}',
        dirpath="output",
        save_weights_only=True,
        every_n_train_steps=100 if args.pretrain else None,
        # every_n_train_steps=100,
        save_top_k=5 if args.pretrain else 1
    )
    callbacks = [early_callback, model_checkpoint]

    # args.weights_summary = "full"  # Print full summary of the model
    trainer = pl.Trainer.from_argparse_args(args, callbacks=callbacks, logger=logger, default_root_dir="training/logs", accelerator="ddp")

    if "EntityEmbedding" not in lit_model.__class__.__name__:
        trainer.fit(lit_model, datamodule=data)
        path = model_checkpoint.best_model_path
        lit_model.load_state_dict(torch.load(path)["state_dict"], strict=False)

    result = trainer.test(lit_model, datamodule=data)
    print(result)

    # _saved_pretrain(lit_model, tokenizer, path)
    if "EntityEmbedding" not in lit_model.__class__.__name__:
        print("*path" * 30)
        print(path)

if __name__ == "__main__":
    main()
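A hypothetical invocation, with values chosen purely for illustration: every flag below comes from main.py, the Lightning Trainer, BaseLitModel, BaseDataModule, or KGC, and BertKGC is the model class added in models/model.py.

python main.py \
  --gpus 1 \
  --max_epochs 10 \
  --data_class KGC \
  --litmodel_class TransformerLitModel \
  --model_class BertKGC \
  --model_name_or_path bert-base-uncased \
  --data_dir dataset/umls \
  --batch_size 16 \
  --lr 5e-5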
6
models/__init__.py
Normal file
@ -0,0 +1,6 @@
from transformers import BartForConditionalGeneration, T5ForConditionalGeneration, GPT2LMHeadModel

from .model import *
1086
models/huggingface_relformer.py
Normal file
File diff suppressed because it is too large
7
models/model.py
Normal file
@ -0,0 +1,7 @@
# from transformers.models.bert.modeling_bert import BertForMaskedLM
from models.huggingface_relformer import BertForMaskedLM

class BertKGC(BertForMaskedLM):

    @staticmethod
    def add_to_argparse(parser):
        parser.add_argument("--pretrain", type=int, default=0, help="")
        return parser
1159
models/utils.py
Normal file
File diff suppressed because it is too large
2
pretrain/data/__init__.py
Normal file
@ -0,0 +1,2 @@
from .data_module import KGC
from .processor import convert_examples_to_features, KGProcessor
71
pretrain/data/base_data_module.py
Normal file
@ -0,0 +1,71 @@
"""Base DataModule class."""
from pathlib import Path
from typing import Dict
import argparse
import os

import pytorch_lightning as pl
from torch.utils.data import DataLoader


class Config(dict):
    def __getattr__(self, name):
        return self.get(name)

    def __setattr__(self, name, val):
        self[name] = val


BATCH_SIZE = 8
NUM_WORKERS = 8


class BaseDataModule(pl.LightningDataModule):
    """
    Base DataModule.
    Learn more at https://pytorch-lightning.readthedocs.io/en/stable/datamodules.html
    """

    def __init__(self, args: argparse.Namespace = None) -> None:
        super().__init__()
        self.args = Config(vars(args)) if args is not None else {}
        self.batch_size = self.args.get("batch_size", BATCH_SIZE)
        self.num_workers = self.args.get("num_workers", NUM_WORKERS)

    @staticmethod
    def add_to_argparse(parser):
        parser.add_argument(
            "--batch_size", type=int, default=BATCH_SIZE, help="Number of examples to operate on per forward step."
        )
        parser.add_argument(
            "--num_workers", type=int, default=0, help="Number of additional processes to load data."
        )
        parser.add_argument(
            "--dataset", type=str, default="./dataset/NELL", help="Path to the dataset directory."
        )
        return parser

    def prepare_data(self):
        """
        Use this method to do things that might write to disk or that need to be done only from a single GPU in distributed settings (so don't set state `self.x = y`).
        """
        pass

    def setup(self, stage=None):
        """
        Split into train, val, test, and set dims.
        Should assign `torch Dataset` objects to self.data_train, self.data_val, and optionally self.data_test.
        """
        self.data_train = None
        self.data_val = None
        self.data_test = None

    def train_dataloader(self):
        return DataLoader(self.data_train, shuffle=True, batch_size=self.batch_size, num_workers=self.num_workers, pin_memory=True)

    def val_dataloader(self):
        return DataLoader(self.data_val, shuffle=False, batch_size=self.batch_size, num_workers=self.num_workers, pin_memory=True)

    def test_dataloader(self):
        return DataLoader(self.data_test, shuffle=False, batch_size=self.batch_size, num_workers=self.num_workers, pin_memory=True)
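A minimal subclass sketch (synthetic data, not from the repository) showing what BaseDataModule expects setup to provide before its dataloaders are used:

import torch
from torch.utils.data import TensorDataset

class ToyDataModule(BaseDataModule):
    def setup(self, stage=None):
        x, y = torch.randn(32, 4), torch.randint(0, 2, (32,))
        self.data_train = TensorDataset(x, y)   # any map-style Dataset works
        self.data_val = TensorDataset(x, y)
        self.data_test = TensorDataset(x, y)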
196
pretrain/data/data_module.py
Normal file
@ -0,0 +1,196 @@
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
from enum import Enum
import torch

from torch.utils.data import DataLoader
from transformers import AutoTokenizer, BertTokenizer
# from transformers.configuration_bert import BertTokenizer, BertTokenizerFast
from transformers.tokenization_utils_base import (BatchEncoding,
                                                  PreTrainedTokenizerBase)

from .base_data_module import BaseDataModule
from .processor import KGProcessor, get_dataset
import transformers
transformers.logging.set_verbosity_error()

class ExplicitEnum(Enum):
    """
    Enum with more explicit error message for missing values.
    """

    @classmethod
    def _missing_(cls, value):
        raise ValueError(
            f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}"
        )

class PaddingStrategy(ExplicitEnum):
    """
    Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for tab-completion
    in an IDE.
    """

    LONGEST = "longest"
    MAX_LENGTH = "max_length"
    DO_NOT_PAD = "do_not_pad"

import numpy as np

@dataclass
class DataCollatorForSeq2Seq:
    """
    Data collator that will dynamically pad the inputs received, as well as the labels.

    Args:
        tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
            The tokenizer used for encoding the data.
        model (:class:`~transformers.PreTrainedModel`):
            The model that is being trained. If set and has the `prepare_decoder_input_ids_from_labels`, use it to
            prepare the `decoder_input_ids`

            This is useful when using `label_smoothing` to avoid calculating loss twice.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:

            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.

            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
        label_pad_token_id (:obj:`int`, `optional`, defaults to -100):
            The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
    """

    tokenizer: PreTrainedTokenizerBase
    model: Optional[Any] = None
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    label_pad_token_id: int = -100
    return_tensors: str = "pt"
    num_labels: int = 0

    def __call__(self, features, return_tensors=None):

        if return_tensors is None:
            return_tensors = self.return_tensors
        labels = [feature.pop("labels") for feature in features] if "labels" in features[0].keys() else None
        label = [feature.pop("label") for feature in features]
        features_keys = {}
        name_keys = list(features[0].keys())
        for k in name_keys:
            # ignore the padding arguments
            if k in ["input_ids", "attention_mask", "token_type_ids"]: continue
            try:
                features_keys[k] = [feature.pop(k) for feature in features]
            except KeyError:
                continue
            # features_keys[k] = [feature.pop(k) for feature in features]

        # We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the
        # same length to return tensors.
        bsz = len(labels)
        with torch.no_grad():
            new_labels = torch.zeros(bsz, self.num_labels)
            for i, l in enumerate(labels):
                if isinstance(l, int):
                    new_labels[i][l] = 1
                else:
                    for j in l:
                        new_labels[i][j] = 1
            labels = new_labels

        features = self.tokenizer.pad(
            features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=return_tensors,
        )
        features['labels'] = labels
        features['label'] = torch.tensor(label)
        features.update(features_keys)

        return features
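A small sketch (toy features, not from the repository) of what the collator returns: per-example labels index lists become a multi-hot (bsz, num_labels) matrix, while the tokenizer pads input_ids and attention_mask to the longest sequence in the batch.

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
collator = DataCollatorForSeq2Seq(tok, padding="longest", num_labels=5)
features = [
    {"input_ids": [101, 7592, 102], "attention_mask": [1, 1, 1], "labels": [0, 3], "label": 0},
    {"input_ids": [101, 7592, 2088, 102], "attention_mask": [1, 1, 1, 1], "labels": 2, "label": 2},
]
batch = collator(features)
print(batch["input_ids"].shape, batch["labels"].shape)  # torch.Size([2, 4]) torch.Size([2, 5])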
class KGC(BaseDataModule):
    def __init__(self, args, model) -> None:
        super().__init__(args)
        self.tokenizer = AutoTokenizer.from_pretrained(self.args.model_name_or_path, use_fast=False)
        self.processor = KGProcessor(self.tokenizer, args)
        self.label_list = self.processor.get_labels(args.data_dir)

        entity_list = self.processor.get_entities(args.data_dir)

        num_added_tokens = self.tokenizer.add_special_tokens({'additional_special_tokens': entity_list})
        self.sampler = DataCollatorForSeq2Seq(self.tokenizer,
            model=model,
            label_pad_token_id=self.tokenizer.pad_token_id,
            pad_to_multiple_of=8 if self.args.precision == 16 else None,
            padding="longest",
            max_length=self.args.max_seq_length,
            num_labels=len(entity_list),
        )
        relations_tokens = self.processor.get_relations(args.data_dir)
        self.num_relations = len(relations_tokens)
        num_added_tokens = self.tokenizer.add_special_tokens({'additional_special_tokens': relations_tokens})

        vocab = self.tokenizer.get_added_vocab()
        self.relation_id_st = vocab[relations_tokens[0]]
        self.relation_id_ed = vocab[relations_tokens[-1]] + 1
        self.entity_id_st = vocab[entity_list[0]]
        self.entity_id_ed = vocab[entity_list[-1]] + 1


    def setup(self, stage=None):
        self.data_train = get_dataset(self.args, self.processor, self.label_list, self.tokenizer, "train")
        self.data_val = get_dataset(self.args, self.processor, self.label_list, self.tokenizer, "dev")
        self.data_test = get_dataset(self.args, self.processor, self.label_list, self.tokenizer, "test")

    def prepare_data(self):
        pass

    def get_config(self):
        d = {}
        for k, v in self.__dict__.items():
            if "st" in k or "ed" in k:
                d.update({k: v})

        return d


    @staticmethod
    def add_to_argparse(parser):
        BaseDataModule.add_to_argparse(parser)
        parser.add_argument("--model_name_or_path", type=str, default="roberta-base", help="the name or the path to the pretrained model")
        parser.add_argument("--data_dir", type=str, default="roberta-base", help="the path to the dataset directory")
        parser.add_argument("--max_seq_length", type=int, default=256, help="maximum sequence length after tokenization")
        parser.add_argument("--warm_up_radio", type=float, default=0.1, help="ratio of total training steps used for learning-rate warmup")
        parser.add_argument("--eval_batch_size", type=int, default=8)
        parser.add_argument("--overwrite_cache", action="store_true", default=False)
        return parser

    def get_tokenizer(self):
        return self.tokenizer

    def train_dataloader(self):
        return DataLoader(self.data_train, num_workers=self.num_workers, pin_memory=True, collate_fn=self.sampler, batch_size=self.args.batch_size, shuffle=not self.args.faiss_init)

    def val_dataloader(self):
        return DataLoader(self.data_val, num_workers=self.num_workers, pin_memory=True, collate_fn=self.sampler, batch_size=self.args.eval_batch_size)

    def test_dataloader(self):
        return DataLoader(self.data_test, num_workers=self.num_workers, pin_memory=True, collate_fn=self.sampler, batch_size=self.args.eval_batch_size)
936
pretrain/data/processor.py
Normal file
@ -0,0 +1,936 @@
import contextlib
import sys

from collections import Counter
from multiprocessing import Pool
from torch.utils.data import Dataset, Sampler, IterableDataset
from collections import defaultdict
from functools import partial
import os
import random
import json
import torch
import copy
import numpy as np
import pickle
from tqdm import tqdm
from dataclasses import dataclass, asdict, replace
import inspect

from transformers.models.auto.tokenization_auto import AutoTokenizer

from models.utils import get_entity_spans_pre_processing

def lmap(a, b):
    return list(map(a, b))
def cache_results(_cache_fp, _refresh=False, _verbose=1):
    r"""
    cache_results is the decorator used in fastNLP to cache data. The example below shows how to use it::

        import time
        import numpy as np
        from fastNLP import cache_results

        @cache_results('cache.pkl')
        def process_data():
            # some relatively slow work, e.g. reading and pre-processing data; time.sleep() stands in for it here
            time.sleep(1)
            return np.random.randint(10, size=(5,))

        start_time = time.time()
        print("res =", process_data())
        print(time.time() - start_time)

        start_time = time.time()
        print("res =", process_data())
        print(time.time() - start_time)

        # The output looks like the following; both runs return the same result and the second run takes almost no time
        # Save cache to cache.pkl.
        # res = [5 4 9 1 8]
        # 1.0042750835418701
        # Read cache from cache.pkl.
        # res = [5 4 9 1 8]
        # 0.0040721893310546875

    The second run takes only a few milliseconds because it reads the result straight from cache.pkl instead of running
    the pre-processing again::

        # Continuing the example above: to generate a separate cache (e.g. for another dataset), call it like this
        process_data(_cache_fp='cache2.pkl')  # this does not affect the previous 'cache.pkl'

    The _cache_fp above is a parameter recognized by cache_results; it caches/reads the data at 'cache2.pkl' instead of
    the default 'cache.pkl'. Decorating a function with @cache_results() adds three parameters [_cache_fp, _refresh,
    _verbose]. These three parameters are not forwarded to your function, so your own function must not use these
    names::

        process_data(_cache_fp='cache2.pkl', _refresh=True)  # force the cache to be regenerated here
        # _verbose controls logging: 0 prints nothing; 1 reports whether the cache was read or newly generated

    :param str _cache_fp: where to cache the return value, or where to read the cache from. If None, cache_results has
        no effect unless _cache_fp is passed at call time.
    :param bool _refresh: whether to regenerate the cache.
    :param int _verbose: whether to print cache information.
    :return:
    """

    def wrapper_(func):
        signature = inspect.signature(func)
        for key, _ in signature.parameters.items():
            if key in ('_cache_fp', '_refresh', '_verbose'):
                raise RuntimeError("The function decorated by cache_results cannot have keyword `{}`.".format(key))

        def wrapper(*args, **kwargs):
            my_args = args[0]
            mode = args[-1]
            if '_cache_fp' in kwargs:
                cache_filepath = kwargs.pop('_cache_fp')
                assert isinstance(cache_filepath, str), "_cache_fp can only be str."
            else:
                cache_filepath = _cache_fp
            if '_refresh' in kwargs:
                refresh = kwargs.pop('_refresh')
                assert isinstance(refresh, bool), "_refresh can only be bool."
            else:
                refresh = _refresh
            if '_verbose' in kwargs:
                verbose = kwargs.pop('_verbose')
                assert isinstance(verbose, int), "_verbose can only be integer."
            else:
                verbose = _verbose
            refresh_flag = True

            model_name = my_args.model_name_or_path.split("/")[-1]
            is_pretrain = my_args.pretrain
            cache_filepath = os.path.join(my_args.data_dir, f"cached_{mode}_features{model_name}_pretrain{is_pretrain}_faiss{my_args.faiss_init}_seqlength{my_args.max_seq_length}_{my_args.litmodel_class}.pkl")
            refresh = my_args.overwrite_cache

            if cache_filepath is not None and refresh is False:
                # load data
                if os.path.exists(cache_filepath):
                    with open(cache_filepath, 'rb') as f:
                        results = pickle.load(f)
                    if verbose == 1:
                        logger.info("Read cache from {}.".format(cache_filepath))
                    refresh_flag = False

            if refresh_flag:
                results = func(*args, **kwargs)
                if cache_filepath is not None:
                    if results is None:
                        raise RuntimeError("The return value is None. Delete the decorator.")
                    with open(cache_filepath, 'wb') as f:
                        pickle.dump(results, f)
                    logger.info("Save cache to {}.".format(cache_filepath))

            return results

        return wrapper

    return wrapper_
import argparse
import csv
import logging
import os
import random
import sys

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

# from torch.nn import CrossEntropyLoss, MSELoss
# from scipy.stats import pearsonr, spearmanr
# from sklearn.metrics import matthews_corrcoef, f1_score


logger = logging.getLogger(__name__)

class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, text_c=None, label=None, real_label=None, en=None, rel=None):
        """Constructs an InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
                sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
                Required only for sequence pair tasks.
            text_c: (Optional) string. The untokenized text of the third sequence.
                Required only for sequence triple tasks.
            label: (Optional) string. List of entities.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.text_c = text_c
        self.label = label
        self.real_label = real_label
        self.en = en
        self.rel = rel  # rel id


@dataclass
class InputFeatures:
    """A single set of features of data."""

    input_ids: torch.Tensor
    attention_mask: torch.Tensor
    labels: torch.Tensor = None
    label: torch.Tensor = None
    en: torch.Tensor = 0
    rel: torch.Tensor = 0
    pos: torch.Tensor = 0
class DataProcessor(object):
|
||||
"""Base class for data converters for sequence classification data sets."""
|
||||
|
||||
def get_train_examples(self, data_dir):
|
||||
"""Gets a collection of `InputExample`s for the train set."""
|
||||
raise NotImplementedError()
|
||||
|
||||
def get_dev_examples(self, data_dir):
|
||||
"""Gets a collection of `InputExample`s for the dev set."""
|
||||
raise NotImplementedError()
|
||||
|
||||
def get_labels(self, data_dir):
|
||||
"""Gets the list of labels for this data set."""
|
||||
raise NotImplementedError()
|
||||
|
||||
@classmethod
|
||||
def _read_tsv(cls, input_file, quotechar=None):
|
||||
"""Reads a tab separated value file."""
|
||||
with open(input_file, "r", encoding="utf-8") as f:
|
||||
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
|
||||
lines = []
|
||||
for line in reader:
|
||||
if sys.version_info[0] == 2:
|
||||
line = list(unicode(cell, 'utf-8') for cell in line)
|
||||
lines.append(line)
|
||||
return lines
|
||||
|
||||
import copy
|
||||
|
||||
|
||||
def solve_get_knowledge_store(line, set_type="train", pretrain=1):
|
||||
"""
|
||||
use the LM to get the entity embedding.
|
||||
Transductive: triples + text description
|
||||
Inductive: text description
|
||||
|
||||
"""
|
||||
examples = []
|
||||
|
||||
head_ent_text = ent2text[line[0]]
|
||||
tail_ent_text = ent2text[line[2]]
|
||||
relation_text = rel2text[line[1]]
|
||||
|
||||
i=0
|
||||
|
||||
a = tail_filter_entities["\t".join([line[0],line[1]])]
|
||||
b = head_filter_entities["\t".join([line[2],line[1]])]
|
||||
|
||||
guid = "%s-%s" % (set_type, i)
|
||||
text_a = head_ent_text
|
||||
text_b = relation_text
|
||||
text_c = tail_ent_text
|
||||
|
||||
# use the description of c to predict A
|
||||
examples.append(
|
||||
InputExample(guid=guid, text_a="[PAD]", text_b=text_b + "[PAD]", text_c = "[PAD]" + " " + text_c, label=lmap(lambda x: ent2id[x], b), real_label=ent2id[line[0]], en=[ent2id[line[0]], rel2id[line[1]], ent2id[line[2]]], rel=0)
|
||||
)
|
||||
examples.append(
|
||||
InputExample(guid=guid, text_a="[PAD]", text_b=text_b + "[PAD]", text_c = "[PAD]" + " " + text_a, label=lmap(lambda x: ent2id[x], b), real_label=ent2id[line[2]], en=[ent2id[line[0]], rel2id[line[1]], ent2id[line[2]]], rel=0)
|
||||
)
|
||||
return examples
|
||||
|
||||
|
||||
def solve(line, set_type="train", pretrain=1):
    examples = []

    head_ent_text = ent2text[line[0]]
    tail_ent_text = ent2text[line[2]]
    relation_text = rel2text[line[1]]

    i = 0

    a = tail_filter_entities["\t".join([line[0], line[1]])]
    b = head_filter_entities["\t".join([line[2], line[1]])]

    guid = "%s-%s" % (set_type, i)
    text_a = head_ent_text
    text_b = relation_text
    text_c = tail_ent_text

    if pretrain:
        # masked entity modelling over random windows of the entity description
        text_a_tokens = text_a.split()
        for i in range(10):
            st = random.randint(0, len(text_a_tokens))
            examples.append(
                InputExample(guid=guid, text_a="[MASK]",
                             text_b=" ".join(text_a_tokens[st:min(st + 64, len(text_a_tokens))]),
                             text_c="", label=ent2id[line[0]], real_label=ent2id[line[0]], en=0, rel=0)
            )
        examples.append(
            InputExample(guid=guid, text_a="[MASK]", text_b=text_a, text_c="",
                         label=ent2id[line[0]], real_label=ent2id[line[0]], en=0, rel=0)
        )
        # examples.append(
        #     InputExample(guid=guid, text_a="[MASK]", text_b=text_c, text_c = "", label=ent2id[line[2]], real_label=ent2id[line[2]], en=0, rel=0)
        # )
    else:
        # earlier [UNK]-based prompt variants, kept commented out:
        # examples.append(
        #     InputExample(guid=guid, text_a="[MASK]", text_b=text_b + "[PAD]", text_c = "[UNK]", label=lmap(lambda x: ent2id[x], b), real_label=ent2id[line[0]], en=ent2id[line[2]], rel=rel2id[line[1]]))
        # examples.append(
        #     InputExample(guid=guid, text_a="[UNK] ", text_b=text_b + "[PAD]", text_c = "[MASK]", label=lmap(lambda x: ent2id[x], a), real_label=ent2id[line[2]], en=ent2id[line[0]], rel=rel2id[line[1]]))

        # examples.append(
        #     InputExample(guid=guid, text_a="[UNK]" + " " + text_c, text_b=text_b + "[PAD]", text_c = "[MASK]", label=lmap(lambda x: ent2id[x], b), real_label=ent2id[line[0]], en=ent2id[line[2]], rel=rel2id[line[1]]))
        # examples.append(
        #     InputExample(guid=guid, text_a="[MASK]", text_b=text_b + "[PAD]", text_c = "[UNK]" + text_a, label=lmap(lambda x: ent2id[x], a), real_label=ent2id[line[2]], en=ent2id[line[0]], rel=rel2id[line[1]]))

        # head prediction: mask the head, condition on (relation, tail description)
        examples.append(
            InputExample(guid=guid, text_a="[MASK]", text_b=text_b + "[PAD]", text_c="[PAD]" + " " + text_c,
                         label=lmap(lambda x: ent2id[x], b), real_label=ent2id[line[0]],
                         en=[rel2id[line[1]], ent2id[line[2]]], rel=rel2id[line[1]]))
        # tail prediction: mask the tail, condition on (head description, relation)
        examples.append(
            InputExample(guid=guid, text_a="[PAD] ", text_b=text_b + "[PAD]", text_c="[MASK]" + " " + text_a,
                         label=lmap(lambda x: ent2id[x], a), real_label=ent2id[line[2]],
                         en=[ent2id[line[0]], rel2id[line[1]]], rel=rel2id[line[1]]))
    return examples

def filter_init(head, tail, t1, t2, ent2id_, ent2token_, rel2id_):
    global head_filter_entities
    global tail_filter_entities
    global ent2text
    global rel2text
    global ent2id
    global ent2token
    global rel2id

    head_filter_entities = head
    tail_filter_entities = tail
    ent2text = t1
    rel2text = t2
    ent2id = ent2id_
    ent2token = ent2token_
    rel2id = rel2id_


def delete_init(ent2text_):
    global ent2text
    ent2text = ent2text_

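# Illustrative sketch (toy data only, never called by the pipeline): `filter_init`
# populates the module-level globals that `solve` / `solve_get_knowledge_store` read,
# so a minimal wiring with hypothetical ids looks like this.
def _demo_solve_wiring_sketch():
    toy_ent2text = {"/m/us": "united states", "/m/tx": "texas"}
    toy_rel2text = {"/loc/contains": "location contains"}
    toy_ent2id = {"/m/us": 0, "/m/tx": 1}
    toy_ent2token = {"/m/us": "[ENTITY_0]", "/m/tx": "[ENTITY_1]"}
    toy_rel2id = {"/loc/contains": 2}  # relation ids start after the entity ids
    toy_tail_filter = {"/m/us\t/loc/contains": ["/m/tx"]}
    toy_head_filter = {"/m/tx\t/loc/contains": ["/m/us"]}
    filter_init(toy_head_filter, toy_tail_filter, toy_ent2text, toy_rel2text,
                toy_ent2id, toy_ent2token, toy_rel2id)
    # one triple expands into a head-prediction and a tail-prediction example
    examples = solve(["/m/us", "/loc/contains", "/m/tx"], pretrain=0)
    return len(examples)  # -> 2
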
class KGProcessor(DataProcessor):
    """Processor for knowledge graph data set."""

    def __init__(self, tokenizer, args):
        self.labels = set()
        self.tokenizer = tokenizer
        self.args = args
        self.entity_path = os.path.join(args.data_dir, "entity2textlong.txt") \
            if os.path.exists(os.path.join(args.data_dir, 'entity2textlong.txt')) \
            else os.path.join(args.data_dir, "entity2text.txt")

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train", data_dir, self.args)

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev", data_dir, self.args)

    def get_test_examples(self, data_dir, chunk=""):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, f"test{chunk}.tsv")), "test", data_dir, self.args)

    def get_relations(self, data_dir):
        """Gets all relations in the knowledge graph and returns their relation tokens."""
        with open(os.path.join(data_dir, "relations.txt"), 'r') as f:
            lines = f.readlines()
            relations = []
            for line in lines:
                relations.append(line.strip().split('\t')[0])
        rel2token = {rel: f"[RELATION_{i}]" for i, rel in enumerate(relations)}
        return list(rel2token.values())

    def get_labels(self, data_dir):
        """Gets the textual names of all relations in the knowledge graph."""
        relation = []
        with open(os.path.join(data_dir, "relation2text.txt"), 'r') as f:
            lines = f.readlines()
            for line in lines:
                relation.append(line.strip().split("\t")[-1])
        return relation

    def get_entities(self, data_dir):
        """Gets all entities in the knowledge graph and returns their entity tokens."""
        with open(self.entity_path, 'r') as f:
            lines = f.readlines()
            entities = []
            for line in lines:
                entities.append(line.strip().split("\t")[0])

        ent2token = {ent: f"[ENTITY_{i}]" for i, ent in enumerate(entities)}
        return list(ent2token.values())

    def get_train_triples(self, data_dir):
        """Gets training triples."""
        return self._read_tsv(os.path.join(data_dir, "train.tsv"))

    def get_dev_triples(self, data_dir):
        """Gets validation triples."""
        return self._read_tsv(os.path.join(data_dir, "dev.tsv"))

    def get_test_triples(self, data_dir, chunk=""):
        """Gets test triples."""
        return self._read_tsv(os.path.join(data_dir, f"test{chunk}.tsv"))

    def _create_examples(self, lines, set_type, data_dir, args):
        """Creates examples for the training and dev sets."""
        # entity to text
        ent2text = {}
        ent2text_with_type = {}
        with open(self.entity_path, 'r') as f:
            ent_lines = f.readlines()
            for line in ent_lines:
                temp = line.strip().split('\t')
                try:
                    end = temp[1]  # .find(',')
                    if "wiki" in data_dir:
                        assert "Q" in temp[0]
                    ent2text[temp[0]] = temp[1].replace("\\n", " ").replace("\\", "")  # [:end]
                except IndexError:
                    # the entity comes without a textual description; keep a blank one
                    end = " "
                    if "wiki" in data_dir:
                        assert "Q" in temp[0]
                    ent2text[temp[0]] = end

        entities = list(ent2text.keys())
        ent2token = {ent: f"[ENTITY_{i}]" for i, ent in enumerate(entities)}
        ent2id = {ent: i for i, ent in enumerate(entities)}

        rel2text = {}
        with open(os.path.join(data_dir, "relation2text.txt"), 'r') as f:
            rel_lines = f.readlines()
            for line in rel_lines:
                temp = line.strip().split('\t')
                rel2text[temp[0]] = temp[1]

        relation_names = {}
        with open(os.path.join(data_dir, "relations.txt"), "r") as file:
            for line in file.readlines():
                t = line.strip()
                relation_names[t] = rel2text[t]

        tmp_lines = []
        not_in_text = 0
        for line in tqdm(lines, desc="delete triples whose entities or relation have no text name"):
            if (line[0] not in ent2text) or (line[2] not in ent2text) or (line[1] not in rel2text):
                not_in_text += 1
                continue
            tmp_lines.append(line)
        lines = tmp_lines
        print(f"total triples not in text : {not_in_text}")

        # relation id -> relation token id (relation ids start right after the entity ids)
        num_entities = len(self.get_entities(args.data_dir))
        rel2id = {w: i + num_entities for i, w in enumerate(relation_names.keys())}

        # add reverse relation
        # tmp_rel2id = {}
        # num_relations = len(rel2id)
        # cnt = 0
        # for k, v in rel2id.items():
        #     tmp_rel2id[k + " (reverse)"] = num_relations + cnt
        #     cnt += 1
        # rel2id.update(tmp_rel2id)

        examples = []
        # (head, relation) -> known tails and (tail, relation) -> known heads
        head_filter_entities = defaultdict(list)
        tail_filter_entities = defaultdict(list)

        dataset_list = ["train.tsv", "dev.tsv", "test.tsv"]
        # in training, only use the train triples
        if set_type == "train" and not args.pretrain:
            dataset_list = dataset_list[0:1]
        for m in dataset_list:
            with open(os.path.join(data_dir, m), 'r') as file:
                train_lines = file.readlines()
                for idx in range(len(train_lines)):
                    train_lines[idx] = train_lines[idx].strip().split("\t")

            for line in train_lines:
                tail_filter_entities["\t".join([line[0], line[1]])].append(line[2])
                head_filter_entities["\t".join([line[2], line[1]])].append(line[0])

        max_head_entities = max(len(_) for _ in head_filter_entities.values())
        max_tail_entities = max(len(_) for _ in tail_filter_entities.values())

        # use bce loss, ignore the mlm
        if set_type == "train" and args.bce:
            lines = []
            for k, v in tail_filter_entities.items():
                h, r = k.split('\t')
                t = v[0]
                lines.append([h, r, t])
            for k, v in head_filter_entities.items():
                t, r = k.split('\t')
                h = v[0]
                lines.append([h, r, t])

        # for pretraining, select each entity once so that its mask embedding can be computed
        if args.pretrain:
            rel = list(rel2text.keys())[0]
            lines = []
            for k in ent2text.keys():
                lines.append([k, rel, k])

        print(f"max number of filter entities : {max_head_entities} {max_tail_entities}")

        from os import cpu_count
        threads = min(1, cpu_count())
        filter_init(head_filter_entities, tail_filter_entities, ent2text, rel2text,
                    ent2id, ent2token, rel2id)

        if hasattr(args, "faiss_init") and args.faiss_init:
            annotate_ = partial(
                solve_get_knowledge_store,
                pretrain=self.args.pretrain
            )
        else:
            annotate_ = partial(
                solve,
                pretrain=self.args.pretrain
            )
        examples = list(
            tqdm(
                map(annotate_, lines),
                total=len(lines),
                desc="convert text to examples"
            )
        )

        # multiprocessing variant, kept for reference:
        # with Pool(threads, initializer=filter_init, initargs=(head_filter_entities, tail_filter_entities,
        #                                                       ent2text, rel2text, ent2text_with_type, rel2id,)) as pool:
        #     annotate_ = partial(solve)
        #     examples = list(
        #         tqdm(
        #             pool.imap(annotate_, lines, chunksize=128),
        #             total=len(lines),
        #             desc="convert text to examples"
        #         )
        #     )

        # flatten the per-triple example lists
        tmp_examples = []
        for e in examples:
            for ee in e:
                tmp_examples.append(ee)
        examples = tmp_examples
        # delete vars
        del head_filter_entities, tail_filter_entities, ent2text, rel2text, ent2id, ent2token, rel2id
        return examples

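# Illustrative sketch (toy names, not part of the original pipeline) of the shared id
# space built above: entities are numbered first, relations continue after them, and
# in `get_dataset` these ids are later shifted past the subword vocabulary
# (stored in input_ids as id + len(tokenizer)).
def _demo_id_layout_sketch():
    entities = ["/m/us", "/m/tx", "/m/oh"]
    relations = ["/loc/contains"]
    ent2id = {e: i for i, e in enumerate(entities)}                    # 0 .. 2
    rel2id = {r: i + len(entities) for i, r in enumerate(relations)}   # 3 ..
    return ent2id, rel2id
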
class Verbalizer(object):

    def __init__(self, args):
        if "WN18RR" in args.data_dir:
            self.mode = "WN18RR"
        elif "FB15k" in args.data_dir:
            self.mode = "FB15k"
        elif "umls" in args.data_dir:
            self.mode = "umls"
        elif "codexs" in args.data_dir:
            self.mode = "codexs"
        elif "codexl" in args.data_dir:
            self.mode = "codexl"
        elif "FB13" in args.data_dir:
            self.mode = "FB13"
        elif "WN11" in args.data_dir:
            self.mode = "WN11"

    def _convert(self, head, relation, tail):
        if self.mode == "umls":
            return f"The {relation} {head} is "

        return f"{head} {relation}"

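# Illustrative sketch (assumed args shape, not part of the original pipeline):
# Verbalizer only inspects args.data_dir, so a SimpleNamespace is enough to try it.
def _demo_verbalizer_sketch():
    from types import SimpleNamespace
    v = Verbalizer(SimpleNamespace(data_dir="dataset/umls"))
    return v._convert("aspirin", "treats", "headache")  # -> "The treats aspirin is "
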
class KGCDataset(Dataset):

    def __init__(self, features):
        self.features = features

    def __getitem__(self, index):
        return self.features[index]

    def __len__(self):
        return len(self.features)

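# Illustrative sketch (toy features): KGCDataset is a thin wrapper over a list of
# feature dicts, so it can be indexed and measured like any torch Dataset.
def _demo_kgc_dataset_sketch():
    toy_features = [{"input_ids": [101, 103, 102], "attention_mask": [1, 1, 1], "label": 0}]
    ds = KGCDataset(toy_features)
    return len(ds), ds[0]["input_ids"]  # -> (1, [101, 103, 102])
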
def convert_examples_to_features_init(tokenizer_for_convert):
    global tokenizer
    tokenizer = tokenizer_for_convert


def convert_examples_to_features(example, max_seq_length, mode, pretrain=1):
    """Converts a single `InputExample` into a feature dict."""
    # tokens_a = tokenizer.tokenize(example.text_a)
    # tokens_b = tokenizer.tokenize(example.text_b)
    # tokens_c = tokenizer.tokenize(example.text_c)
    # _truncate_seq_triple(tokens_a, tokens_b, tokens_c, max_length=max_seq_length)
    text_a = " ".join(example.text_a.split()[:128])
    text_b = " ".join(example.text_b.split()[:128])
    text_c = " ".join(example.text_c.split()[:128])

    if pretrain:
        input_text_a = text_a
        input_text_b = text_b
    else:
        input_text_a = tokenizer.sep_token.join([text_a, text_b])
        input_text_b = text_c

    inputs = tokenizer(
        input_text_a,
        input_text_b,
        truncation="longest_first",
        max_length=max_seq_length,
        padding="longest",
        add_special_tokens=True,
    )
    # assert tokenizer.mask_token_id in inputs.input_ids, "mask token must be in the input"

    features = asdict(InputFeatures(input_ids=inputs["input_ids"],
                                    attention_mask=inputs['attention_mask'],
                                    labels=torch.tensor(example.label),
                                    label=torch.tensor(example.real_label)
                                    )
                      )
    return features

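# Illustrative sketch (assumes a "bert-base-uncased" checkpoint is available; the
# checkpoint name and texts are placeholders, not the pipeline's configuration):
# in the link-prediction branch, text_a and text_b are joined with the tokenizer's
# [SEP] token and encoded together with text_c as a sentence pair.
def _demo_pair_encoding_sketch():
    tok = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)
    first = tok.sep_token.join(["[MASK]", "location contains [PAD]"])
    second = "[PAD] texas is a state in the united states"
    return tok(first, second, truncation="longest_first", max_length=32,
               add_special_tokens=True)["input_ids"]
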
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()


def _truncate_seq_triple(tokens_a, tokens_b, tokens_c, max_length):
    """Truncates a sequence triple in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b) + len(tokens_c)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b) and len(tokens_a) > len(tokens_c):
            tokens_a.pop()
        elif len(tokens_b) > len(tokens_a) and len(tokens_b) > len(tokens_c):
            tokens_b.pop()
        elif len(tokens_c) > len(tokens_a) and len(tokens_c) > len(tokens_b):
            tokens_c.pop()
        else:
            tokens_c.pop()

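# Illustrative sketch (toy "tokens"): the pair truncation trims the longer list one
# element at a time until the combined length fits the budget.
def _demo_truncate_pair_sketch():
    tokens_a = list("abcdefgh")   # 8 toy tokens
    tokens_b = list("xyz")        # 3 toy tokens
    _truncate_seq_pair(tokens_a, tokens_b, max_length=6)
    return tokens_a, tokens_b     # -> (['a', 'b', 'c'], ['x', 'y', 'z'])
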
@cache_results(_cache_fp="./dataset")
def get_dataset(args, processor, label_list, tokenizer, mode):

    assert mode in ["train", "dev", "test"], "mode must be in train dev test!"

    # use training data to construct the entity embedding
    combine_train_and_test = False
    if args.faiss_init and mode == "test" and not args.pretrain:
        mode = "train"
        if "ind" in args.data_dir:
            combine_train_and_test = True
    else:
        pass

    if mode == "train":
        train_examples = processor.get_train_examples(args.data_dir)
    elif mode == "dev":
        train_examples = processor.get_dev_examples(args.data_dir)
    else:
        train_examples = processor.get_test_examples(args.data_dir)

    if combine_train_and_test:
        logger.info("use all the dataset for getting the entity mask embedding in pretraining")
        train_examples = processor.get_test_examples(args.data_dir) + \
            processor.get_train_examples(args.data_dir) + processor.get_dev_examples(args.data_dir)

    from os import cpu_count
    # dump the examples to disk so that the multiprocessing encoder can stream them back in
    with open(os.path.join(args.data_dir, f"examples_{mode}.txt"), 'w') as file:
        for line in train_examples:
            d = {}
            d.update(line.__dict__)
            file.write(json.dumps(d) + '\n')

    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=False)

    features = []
    # single-process alternative (kept for reference):
    # convert_examples_to_features_init(tokenizer)
    # annotate_ = partial(convert_examples_to_features, max_seq_length=args.max_seq_length,
    #                     mode=mode, pretrain=args.pretrain)
    # features = list(tqdm(map(annotate_, train_examples), total=len(train_examples)))

    file_inputs = [os.path.join(args.data_dir, f"examples_{mode}.txt")]
    file_outputs = [os.path.join(args.data_dir, f"features_{mode}.txt")]

    with contextlib.ExitStack() as stack:
        inputs = [
            stack.enter_context(open(input, "r", encoding="utf-8"))
            if input != "-" else sys.stdin
            for input in file_inputs
        ]
        outputs = [
            stack.enter_context(open(output, "w", encoding="utf-8"))
            if output != "-" else sys.stdout
            for output in file_outputs
        ]

        encoder = MultiprocessingEncoder(tokenizer, args)
        pool = Pool(16, initializer=encoder.initializer)
        encoder.initializer()
        encoded_lines = pool.imap(encoder.encode_lines, zip(*inputs), 1000)
        # encoded_lines = map(encoder.encode_lines, zip(*inputs))

        stats = Counter()
        for i, (filt, enc_lines) in tqdm(enumerate(encoded_lines, start=1), total=len(train_examples)):
            if filt == "PASS":
                for enc_line, output_h in zip(enc_lines, outputs):
                    features.append(eval(enc_line))
                    # print(enc_line, file=output_h)
            else:
                stats["num_filtered_" + filt] += 1

        for k, v in stats.most_common():
            print("[{}] filtered {} lines".format(k, v), file=sys.stderr)

    # pooled alternative for the per-example converter (kept for reference):
    # threads = min(16, cpu_count())
    # with Pool(threads, initializer=convert_examples_to_features_init, initargs=(tokenizer,)) as pool:
    #     annotate_ = partial(convert_examples_to_features, max_seq_length=args.max_seq_length,
    #                         mode=mode, pretrain=args.pretrain)
    #     features = list(tqdm(pool.imap_unordered(annotate_, train_examples),
    #                          total=len(train_examples), desc="convert examples to features"))

    # num_entities = len(processor.get_entities(args.data_dir))
    # rewrite the [PAD] placeholder positions with entity/relation ids shifted past the
    # subword vocabulary, and remember where the gold entity landed (needed for faiss init)
    for f_id, f in enumerate(features):
        en = features[f_id].pop("en")
        rel = features[f_id].pop("rel")
        real_label = f['label']
        cnt = 0
        if not isinstance(en, list):
            break

        pos = 0
        for i, t in enumerate(f['input_ids']):
            if t == tokenizer.pad_token_id:
                features[f_id]['input_ids'][i] = en[cnt] + len(tokenizer)
                cnt += 1
                if features[f_id]['input_ids'][i] == real_label + len(tokenizer):
                    pos = i
                if cnt == len(en):
                    break
        assert not (args.faiss_init and pos == 0)
        features[f_id]['pos'] = pos

    features = KGCDataset(features)
    return features

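# Illustrative sketch (toy numbers: pad id 0, vocab size 100) of the post-processing
# step above: [PAD] slots in input_ids are rewritten to entity/relation ids shifted
# past the subword vocabulary, and `pos` records where the gold entity ended up.
def _demo_pad_replacement_sketch():
    pad_token_id, vocab_size = 0, 100
    feature = {"input_ids": [2, 0, 7, 0, 3], "label": 5, "en": [5, 103]}
    en, real_label, cnt, pos = feature["en"], feature["label"], 0, 0
    for i, t in enumerate(feature["input_ids"]):
        if t == pad_token_id:
            feature["input_ids"][i] = en[cnt] + vocab_size
            cnt += 1
            if feature["input_ids"][i] == real_label + vocab_size:
                pos = i
            if cnt == len(en):
                break
    feature["pos"] = pos
    return feature  # input_ids -> [2, 105, 7, 203, 3], pos -> 1
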
class MultiprocessingEncoder(object):

    def __init__(self, tokenizer, args):
        self.tokenizer = tokenizer
        self.pretrain = args.pretrain
        self.max_seq_length = args.max_seq_length

    def initializer(self):
        global bpe
        bpe = self.tokenizer

    def encode(self, line):
        global bpe
        ids = bpe.encode(line)
        return list(map(str, ids))

    def decode(self, tokens):
        global bpe
        return bpe.decode(tokens)

    def encode_lines(self, lines):
        """
        Encode a set of lines. All lines will be encoded together.
        """
        enc_lines = []
        for line in lines:
            line = line.strip()
            if len(line) == 0:
                return ["EMPTY", None]
            enc_lines.append(json.dumps(self.convert_examples_to_features(example=eval(line))))
        return ["PASS", enc_lines]

    def decode_lines(self, lines):
        dec_lines = []
        for line in lines:
            tokens = map(int, line.strip().split())
            dec_lines.append(self.decode(tokens))
        return ["PASS", dec_lines]

    def convert_examples_to_features(self, example):
        """Converts one serialized `InputExample` dict into a feature dict."""
        pretrain = self.pretrain
        max_seq_length = self.max_seq_length
        global bpe

        # text_a = " ".join(example['text_a'].split()[:128])
        # text_b = " ".join(example['text_b'].split()[:128])
        # text_c = " ".join(example['text_c'].split()[:128])
        text_a = example['text_a']
        text_b = example['text_b']
        text_c = example['text_c']

        if pretrain:
            # the description of xxx is [MASK] .
            input_text = f"The description of {text_a} is that {text_b} ."
            inputs = bpe(
                input_text,
                truncation="longest_first",
                max_length=max_seq_length,
                padding="longest",
                add_special_tokens=True,
            )
        else:
            if text_a == "[MASK]":
                input_text_a = bpe.sep_token.join([text_a, text_b])
                input_text_b = text_c
            else:
                input_text_a = text_a
                input_text_b = bpe.sep_token.join([text_b, text_c])

            inputs = bpe(
                input_text_a,
                input_text_b,
                truncation="longest_first",
                max_length=max_seq_length,
                padding="longest",
                add_special_tokens=True,
            )
        # assert bpe.mask_token_id in inputs.input_ids, "mask token must be in the input"

        features = asdict(InputFeatures(input_ids=inputs["input_ids"],
                                        attention_mask=inputs['attention_mask'],
                                        labels=example['label'],
                                        label=example['real_label'],
                                        en=example['en'],
                                        rel=example['rel']
                                        )
                          )
        return features

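# Illustrative sketch (hypothetical helper, not called by the pipeline) of the line
# protocol used with the multiprocessing pool above: each input line is a serialized
# InputExample dict, and encode_lines returns ["PASS", [json-encoded feature dicts]]
# or ["EMPTY", None] for blank lines. `encoder` is assumed to wrap a loaded tokenizer.
def _demo_encode_lines_protocol_sketch(encoder, example_dict):
    encoder.initializer()  # binds the tokenizer to the module-level `bpe` global
    status, enc_lines = encoder.encode_lines([json.dumps(example_dict)])
    assert status in ("PASS", "EMPTY")
    return enc_lines
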
if __name__ == "__main__":
|
||||
dataset = KGCDataset('./dataset')
|