MIT License
Copyright (c) 2021 ZJUNLP
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

@ -1,6 +0,0 @@
from transformers import BartForConditionalGeneration, T5ForConditionalGeneration, GPT2LMHeadModel
from .model import *

@ -1,6 +0,0 @@
"#examples": 3994,
"#kept_examples": 3994,
"#mappable_examples": 743,
"#multiple_answer_examples": 2

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -1,6 +0,0 @@
"#examples": 3996,
"#kept_examples": 3996,
"#mappable_examples": 755,
"#multiple_answer_examples": 0

File diff suppressed because it is too large Load Diff

@ -1,6 +0,0 @@
"#examples": 20358,
"#kept_examples": 20358,
"#mappable_examples": 3713,
"#multiple_answer_examples": 4

File diff suppressed because it is too large Load Diff

@ -1,6 +0,0 @@
"#examples": 3994,
"#kept_examples": 3994,
"#mappable_examples": 743,
"#multiple_answer_examples": 2

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -1,6 +0,0 @@
"#examples": 3996,
"#kept_examples": 3996,
"#mappable_examples": 755,
"#multiple_answer_examples": 0

File diff suppressed because it is too large Load Diff

@ -1,6 +0,0 @@
"#examples": 20358,
"#kept_examples": 20358,
"#mappable_examples": 3713,
"#multiple_answer_examples": 4

File diff suppressed because it is too large Load Diff

@ -1,403 +0,0 @@
import json
import math
import argparse
from pathlib import Path
from transformers import BertTokenizer, BertForMaskedLM, AdamW, get_linear_schedule_with_warmup, AutoConfig
import torch
from torch import device, nn
from import DataLoader, Dataset
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
from pytorch_lightning.utilities.seed import seed_everything
from transformers.tokenization_bert import BertTokenizerFast
from kge.model import KgeModel
from import load_checkpoint
from kge.util import sc
# from relphormer.lit_models import TransformerLitModel
from relphormer.models import BertKGC
# from import KGC
import os
MODEL = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL)
class FBQADataset(Dataset):
    """In-memory dataset backed by a single JSON file.

    The file is decoded once at construction time and kept in
    ``self.examples``; items are returned exactly as they appear in the
    decoded JSON sequence.
    """

    def __init__(self, file_dir):
        """Load every example from the JSON file at ``file_dir``.

        Args:
            file_dir: Path (``str`` or path-like) of the JSON file to read.
        """
        # Use a context manager so the handle is closed deterministically
        # instead of whenever the garbage collector gets to it.
        with Path(file_dir).open("rb") as handle:
            self.examples = json.load(handle)

    def __len__(self):
        """Return the number of loaded examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the example at ``idx``; tensor indices are converted first."""
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return self.examples[idx]
def fbqa_collate(samples):
questions = []
answers = []
answer_ids = []
entities = []
entity_names = []
relations = []
for item in samples:
q = item["RawQuestion"] + "[MASK]" * len(item["AnswerEntity"]) + "."
questions = tokenizer(questions, return_tensors='pt', padding=True)
entity_names = tokenizer(entity_names, add_special_tokens=False)
answers, answers_lengths = sc.pad_seq_of_seq(answers)
answers = torch.LongTensor(answers)
answers_lengths = torch.LongTensor(answers_lengths)
answer_ids = torch.LongTensor(answer_ids)
input_ids = questions['input_ids']
masked_labels = torch.ones_like(input_ids) * -100
masked_labels[input_ids == tokenizer.mask_token_id] = answers[answers != 0]
entity_mask = torch.zeros_like(input_ids).bool()
entity_span_index = input_ids.new_zeros((len(input_ids), 2))
for i, e_tokens in enumerate(entity_names['input_ids']):
q_tokens = input_ids[i].tolist()
for s_index in range(len(q_tokens) - len(e_tokens)):
if all([e_token == q_tokens[s_index + j] for j, e_token in enumerate(e_tokens)]):
entity_mask[i][s_index:s_index + len(e_tokens)] = True
entity_span_index[i][0] = s_index
entity_span_index[i][1] = s_index + len(e_tokens) - 1
entities = torch.LongTensor(entities)
relations = torch.LongTensor(relations)
return, masked_labels, answers, answers_lengths, answer_ids, entities, relations, entity_mask, entity_span_index
class SelfOutput(nn.Module):
    """Post-attention output layer: dense projection, dropout, residual LayerNorm."""

    def __init__(self, config):
        """Build the layer from a config object.

        Args:
            config: Object exposing ``hidden_size``, ``layer_norm_eps`` and
                ``hidden_dropout_prob``.
        """
        # nn.Module has to be initialised before any sub-module is assigned;
        # without this call the first assignment below raises AttributeError.
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Project ``hidden_states``, apply dropout, add ``input_tensor`` and normalise."""
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        # Residual connection followed by layer normalisation.
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
class CrossAttention(nn.Module):
def __init__(self, config, ctx_hidden_size):
self.self = CrossAttentionInternal(config, ctx_hidden_size)
self.output = SelfOutput(config)
self.config = config
def _init_weights(self, module):
""" Initialize the weights """
if isinstance(module, (nn.Linear, nn.Embedding)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf, std=self.config.initializer_range)
elif isinstance(module, nn.LayerNorm):
if isinstance(module, nn.Linear) and module.bias is not None:
def forward(
self_outputs = self.self(
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
class CrossAttentionInternal(nn.Module):
def __init__(self, config, ctx_hidden_size):
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (config.hidden_size, config.num_attention_heads)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(ctx_hidden_size, self.all_head_size)
self.value = nn.Linear(ctx_hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def transpose_for_scores(self, x):
    """Split the last dimension of ``x`` into attention heads.

    The trailing dimension is viewed as
    ``(num_attention_heads, attention_head_size)`` and axes 1 and 2 of the
    resulting 4-D tensor are then swapped.
    """
    leading_dims = x.size()[:-1]
    per_head = x.view(*leading_dims, self.num_attention_heads, self.attention_head_size)
    return per_head.permute(0, 2, 1, 3)
def forward(
mixed_query_layer = self.query(hidden_states)
# If this is instantiated as a cross-attention module, the keys
# and values come from an encoder; the attention mask needs to be
# such that the encoder's padding tokens are not attended to.
mixed_key_layer = self.key(encoder_hidden_states)
mixed_value_layer = self.value(encoder_hidden_states)
attention_mask = encoder_attention_mask
query_layer = self.transpose_for_scores(mixed_query_layer)
key_layer = self.transpose_for_scores(mixed_key_layer)
value_layer = self.transpose_for_scores(mixed_value_layer)
# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
if attention_mask is not None:
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities.
attention_probs = nn.Softmax(dim=-1)(attention_scores)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)
# Mask heads if we want to
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
outputs = (context_layer, nn.Softmax(dim=-1)(attention_scores)) if output_attentions else (context_layer,)
return outputs
class CrossTrmFinetuner(pl.LightningModule):
def __init__(self, hparams, bertmodel):
self._hparams = hparams = hparams['lr']
self.weight_decay = hparams['weight_decay']
self.kg_dim = 320
# self.bert = BertForMaskedLM.from_pretrained(MODEL)
self.bert = bertmodel
if self._hparams['use_hitter']:
self.kg_layer_num = 10
self.cross_attentions = nn.ModuleList([CrossAttention(self.bert.config, self.kg_dim)
for _ in range(self.kg_layer_num)])
checkpoint = load_checkpoint('local/best/20200812-174221-trmeh-fb15k237-best/')
self.hitter = KgeModel.create_from(checkpoint)
def forward(self, batch):
sent_input, masked_labels, batch_labels, label_lens, answer_ids, s, p, entity_mask, entity_span_index = batch
if self._hparams['use_hitter']:
# kg_masks: [bs, 1, 1, length]
# kg_embeds: nlayer*[bs, length, dim]
kg_embeds, kg_masks = self.hitter('get_hitter_repr', s, p)
kg_attentions = [None] * 2 + [(self.cross_attentions[i], kg_embeds[(i + 2) // 2], kg_masks)
for i in range(self.kg_layer_num)]
kg_attentions = []
out = self.bert(kg_attentions=kg_attentions,
return out
def training_step(self, batch, batch_idx):
    """Run the model on ``batch``, log its loss as ``train_loss`` and return it.

    Returns:
        A dict with the single key ``'loss'``.
    """
    step_loss = self(batch).loss
    self.log('train_loss', step_loss, on_epoch=True, prog_bar=True)
    return dict(loss=step_loss)
def validation_step(self, batch, batch_idx):
    """Score one validation batch by exact match over the masked positions.

    A sample counts as a hit only when every top-1 prediction at its masked
    positions equals the corresponding label token.

    Returns:
        A dict with ``val_loss`` (the model loss), ``val_acc`` (float tensor
        holding one 0/1 entry per sample) and ``pred_strings`` (hit flag
        followed by the decoded input with the masks filled in).
    """
    batch_inputs, masked_labels, batch_labels, label_lens, answer_ids, s, p, entity_mask, _ = batch
    output = self(batch)
    filled_tokens = batch_inputs["input_ids"].clone()

    # Keep only the logits at labelled (masked) positions; boolean indexing
    # flattens them across the batch in row-major order.
    mask_logits = output.logits[masked_labels != -100]
    _, predictions = mask_logits.softmax(dim=-1).topk(1)

    hits = []
    offset = 0  # start of the current sample inside the flattened predictions
    for sample_i, label_length in enumerate(label_lens.tolist()):
        mismatches = [
            (predictions[offset + k] == batch_labels[sample_i][k]).sum() != 1
            for k in range(label_length)
        ]
        hits.append(0 if any(mismatches) else 1)
        offset += label_length
    hits = torch.tensor(hits)

    # Write the predicted tokens back into the mask slots for readable output.
    filled_tokens[filled_tokens == tokenizer.mask_token_id] = predictions.flatten()
    pred_strings = []
    for row in range(filled_tokens.size(0)):
        decoded = tokenizer.decode(filled_tokens[row], skip_special_tokens=True)
        pred_strings.append(str(hits[row].item()) + ' ' + decoded)

    return {'val_loss': output.loss,
            'val_acc': hits.float(),
            'pred_strings': pred_strings}
def validation_epoch_end(self, outputs):
    """Aggregate the per-batch validation results into epoch-level metrics.

    Args:
        outputs: List of dicts as produced by ``validation_step``; each holds
            ``val_loss``, ``val_acc`` and ``pred_strings``.

    Returns:
        A dict with the mean validation loss under ``'val_loss'``.
    """
    avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
    # Per-batch hit vectors may differ in length (e.g. a smaller last batch),
    # so they are concatenated rather than stacked before averaging.
    avg_val_acc = torch.cat([x['val_acc'] for x in outputs]).mean().to(avg_loss.device)

    if self.global_rank == 0:
        # Only rank 0 writes the decoded predictions to TensorBoard.
        tensorboard = self.logger.experiment
        all_pred_strings = [line for x in outputs for line in x['pred_strings']]
        tensorboard.add_text('pred', '\n\n'.join(all_pred_strings), self.global_step)

    self.log('avg_loss', avg_loss, on_epoch=True, prog_bar=True, sync_dist=True)
    self.log('avg_val_acc', avg_val_acc, on_epoch=True, prog_bar=True, sync_dist=True)
    return {'val_loss': avg_loss}
def train_dataloader(self):
return DataLoader(FBQADataset(self._hparams['train_dataset']),
def val_dataloader(self):
return DataLoader(FBQADataset(self._hparams['val_dataset']),
def test_dataloader(self):
return DataLoader(FBQADataset(self._hparams['test_dataset']),
def configure_optimizers(self):
no_decay = ['bias', 'LayerNorm.weight']
no_fine_tune = ['cross_attentions']
pgs = [{'params': [p for n, p in self.named_parameters() if not any(nd in n for nd in no_decay) and not any([i in n for i in no_fine_tune])],
'weight_decay': 0.01},
{'params': [p for n, p in self.named_parameters() if any(nd in n for nd in no_decay) and not any([i in n for i in no_fine_tune])],
'weight_decay': 0.0}]
if self._hparams['use_hitter']:
pgs.append({'params': self.cross_attentions.parameters(), 'lr': 5e-5, 'weight_decay': 0.01})
# bert_optimizer = AdamW(pgs, lr=3e-5, weight_decay=1e-2)
bert_optimizer = AdamW(pgs,, weight_decay=self.weight_decay)
bert_scheduler = {
'scheduler': get_linear_schedule_with_warmup(bert_optimizer, self._hparams['max_steps'] // 10, self._hparams['max_steps']),
'interval': 'step',
'monitor': None
return [bert_optimizer], [bert_scheduler]
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--exp_name", default='default', nargs='?', help="Name of the experiment")
parser.add_argument('--dataset', choices=['fbqa', 'webqsp'], default='fbqa', help="fbqa or webqsp")
parser.add_argument('--filtered', default=False, action='store_true', help="Filtered or not")
parser.add_argument('--hitter', default=False, action='store_true', help="Use pretrained HittER or not")
parser.add_argument('--relphormer', default=False, action='store_true', help="Use pretrained relphormer or not")
parser.add_argument('--seed', default=333, type=int, help='Seed number')
parser.add_argument('--lr', default=3e-5, type=float, help='learning rate')
parser.add_argument('--weight_decay', default=1e-2, type=float, help='weight decay')
args = parser.parse_args()
QA_DATASET = args.dataset
if args.filtered and args.relphormer:
SUBSET = 'relphormer-filtered'
elif not args.filtered and args.relphormer:
SUBSET = 'relphormer'
elif args.filtered and not args.relphormer:
SUBSET = 'fb15k237-filtered'
SUBSET = 'fb15k237'
hparams = {
'use_hitter': args.hitter,
'relphormer': args.relphormer,
'weight_decay': args.weight_decay,
'batch_size': 16,
'max_epochs': 20,
'train_dataset': f'data/{QA_DATASET}/{SUBSET}/train.json',
'val_dataset': f'data/{QA_DATASET}/{SUBSET}/test.json',
'test_dataset': f'data/{QA_DATASET}/{SUBSET}/test.json',
if hparams['relphormer']:
MODEL = "./local/relphormer/"
config = AutoConfig.from_pretrained(MODEL)
bertmodel = BertForMaskedLM.from_pretrained(MODEL, config=config)
model = CrossTrmFinetuner(hparams, bertmodel=bertmodel)
bertmodel = BertForMaskedLM.from_pretrained(MODEL)
model = CrossTrmFinetuner(hparams, bertmodel=bertmodel)
model.hparams['max_steps'] = (len(model.train_dataloader().dataset) // hparams['batch_size'] + 1) * hparams['max_epochs']
base_path = '/tmp/hitbert-paper'
logger = TensorBoardLogger(base_path, args.exp_name)
checkpoint_callback = ModelCheckpoint(
dirpath=base_path + '/' + args.exp_name,
trainer = pl.Trainer(gpus=1, accelerator="ddp",
max_epochs=hparams['max_epochs'], max_steps=model.hparams['max_steps'],
gradient_clip_val=1.0, logger=logger,
callbacks=[LearningRateMonitor(), checkpoint_callback])
print("QA Task End!")

@ -1,8 +0,0 @@
# from transformers.models.bert.modeling_bert import BertForMaskedLM
from models.huggingface_relformer import BertForMaskedLM
class BertKGC(BertForMaskedLM):
    """Subclass of the project's ``BertForMaskedLM`` that only adds a CLI option."""

    # Declared static because the function takes the parser, not ``self``;
    # without the decorator it fails when called on an instance.
    @staticmethod
    def add_to_argparse(parser):
        """Register the ``--pretrain`` integer option on ``parser`` and return it."""
        parser.add_argument("--pretrain", type=int, default=0, help="")
        return parser

@ -1,10 +0,0 @@
for SEED in 111 222 333 444 555 666 777 888 999
# echo ${LR} ${WD}
python --dataset fbqa \
--relphormer \
--seed ${SEED} \
--exp_name relphormer-fbqa \
--lr 3e-5 \
--weight_decay 1e-2

@ -1,13 +0,0 @@
for SEED in 111 222 333 444 555 666 777 888 999
# echo ${LR} ${WD}
python --dataset fbqa \
--relphormer \
--filtered \
--seed ${SEED} \
--exp_name relphormer-filtered-fbqa \
--lr 3e-5 \
--weight_decay 1e-2

@ -1,10 +0,0 @@
for SEED in 222 333 444 555 666 777 888 999
python --dataset webqsp \
--relphormer \
--seed ${SEED} \
--exp_name relphormer-webqsp \
--lr 3e-5 \
--weight_decay 1e-2

@ -1,12 +0,0 @@
for SEED in 111 222 333 444 555 666 777 888 999
# echo ${LR} ${WD}
python --dataset webqsp \
--relphormer \
--filtered \
--seed ${SEED} \
--exp_name relphormer-filtered-webqsp \
--lr 3e-5 \
--weight_decay 1e-2

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -1,6 +0,0 @@
"#examples": 1639,
"#kept_examples": 484,
"#mappable_examples": 484,
"#multiple_answer_examples": 800

File diff suppressed because it is too large Load Diff

@ -1,6 +0,0 @@
"#examples": 3098,
"#kept_examples": 850,
"#mappable_examples": 850,
"#multiple_answer_examples": 1437

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -1,6 +0,0 @@
"#examples": 1639,
"#kept_examples": 1582,
"#mappable_examples": 484,
"#multiple_answer_examples": 800

File diff suppressed because it is too large Load Diff

@ -1,6 +0,0 @@
"#examples": 3098,
"#kept_examples": 2997,
"#mappable_examples": 850,
"#multiple_answer_examples": 1437

File diff suppressed because it is too large Load Diff

@ -1,115 +0,0 @@
# Relphormer
Code for the paper: "Relphormer: Relational Graph Transformer for Knowledge Graph Representations".
> Transformers have achieved remarkable performance in widespread fields, including natural language processing, computer vision and graph mining. However, vanilla Transformer architectures have not yielded promising improvements in the Knowledge Graph (KG) representations, where the translational distance paradigm dominates this area. Note that vanilla Transformer architectures struggle to capture the intrinsically heterogeneous semantic and structural information of knowledge graphs. To this end, we propose a new variant of Transformer for knowledge graph representations dubbed Relphormer. Specifically, we introduce Triple2Seq which can dynamically sample contextualized sub-graph sequences as the input to alleviate the heterogeneity issue. We propose a novel structure-enhanced self-attention mechanism to encode the relational information and keep the globally semantic information among sub-graphs. Moreover, we propose masked knowledge modeling as a new paradigm for knowledge graph representation learning. We apply Relphormer to three tasks, namely, knowledge graph completion, KG-based question answering and KG-based recommendation for evaluation. Experimental results show that Relphormer can obtain better performance on benchmark datasets compared with baselines.
# Model Architecture
<div align=center>
<img src="./resource/model.png" width="85%" height="75%" />
The model architecture of Relphormer.
The contextualized sub-graph is sampled with Triple2Seq, and then it will be converted into sequences while maintaining its sub-graph structure.
Next, we conduct masked knowledge modeling, which randomly masks the nodes in the center triple in the contextualized sub-graph sequences.
For the transformer architecture, we design a novel structure-enhanced mechanism to preserve the structure feature.
Finally, we utilize our pre-trained KG transformer for KG-based downstream tasks.
# Environments
- python (3.8.13)
- cuda(11.2)
- Ubuntu-18.04.6 (4.15.0-156-generic)
# Requirements
To run the code, first install the requirements:
pip install -r requirements.txt
The expected structure of files is:
── Relphormer
├── data
├── dataset
│   ├── FB15k-237
│   ├── WN18RR
│   ├── umls
│   ├──
├── lit_models
│   ├──
│   ├──
│   ├──
│   └──
├── models
│   ├──
│   ├──
│   ├──
│   └──
├── resource
│   └── model.png
├── scripts
│ ├── fb15k-237
│ ├── wn18rr
│   └── umls
├── QA
├── logs
└── requirements.txt
# How to run
## KGC Task
### Generate Masked Neighbors
- Use the command below to generate the masked neighbors.
>> cd dataset
>> python --dataset xxx # like python --dataset umls
### Entity Embedding Initialization
- Then use the command below to add the entities to BERT and initialize the entity embedding layer that is used in the later training. For the other datasets, `FB15k-237` and `WN18RR`, simply replace the dataset name with `fb15k-237` or `wn18rr`.
>> cd pretrain
>> mkdir logs
>> bash scripts/
>> tail -f -n 2000 logs/pretrain_umls.log
The pretrained models are saved in the `Relphormer/pretrain/output` directory.
### Entity Prediction
- Next, use the command below to train the model to predict the correct entity at the masked position. The same procedure applies to the other datasets.
>> cd Relphormer
>> mkdir logs
>> bash scripts/umls/
>> tail -f -n 2000 logs/train_umls.log
The trained models are saved in the `Relphormer/output` directory.
## QA Task
The experimental settings for QA follow those of HittER, and the environment can be installed by referring to its GitHub repository. We only modified the fine-tuning script to fit our model.
- The relphormer model used by QA can be downloaded [here](
>> cd QA
>> sh scripts/
>> sh scripts/
>> sh scripts/
>> sh scripts/

View File

@ -0,0 +1,24 @@
"version": 1,
"disable_existing_loggers": false,
"formatters": {
"simple": {
"format": "%(asctime)s - %(name)s - [%(levelname)s] - %(message)s"
"handlers": {
"file_handler": {
"class": "logging.FileHandler",
"level": "DEBUG",
"formatter": "simple",
"filename": "python_logging.log",
"encoding": "utf8"
"root": {
"level": "DEBUG",
"handlers": [

@ -1,2 +0,0 @@
from .data_module import KGC
from .processor import convert_examples_to_features, KGProcessor

View File

@ -1,63 +0,0 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import cython
from cython.parallel cimport prange, parallel
cimport numpy
import numpy
def floyd_warshall(adjacency_matrix):
    """All-pairs shortest paths over a square adjacency matrix.

    Off-diagonal zeros are treated as "no edge". Distances are capped at 510,
    which doubles as the marker for unreachable pairs.

    Returns ``(M, path)``:
      * ``M[i][j]``    - shortest distance from i to j (510 if unreachable).
      * ``path[i][j]`` - the last intermediate node k that shortened i -> j,
                         0 if no intermediate was recorded, 510 if unreachable.
    """
    (nrows, ncols) = adjacency_matrix.shape
    assert nrows == ncols
    cdef unsigned int n = nrows

    # Work on a C-contiguous integer copy so the raw row pointers below are valid
    # and the caller's matrix is left untouched.
    adj_mat_copy = adjacency_matrix.astype(long, order='C', casting='safe', copy=True)
    assert adj_mat_copy.flags['C_CONTIGUOUS']
    cdef numpy.ndarray[long, ndim=2, mode='c'] M = adj_mat_copy
    cdef numpy.ndarray[long, ndim=2, mode='c'] path = numpy.zeros([n, n], dtype=numpy.int64)

    cdef unsigned int i, j, k
    cdef long M_ij, M_ik, cost_ikkj
    # Base pointer of the distance matrix; rows are addressed as M_ptr + n*row.
    cdef long* M_ptr = &M[0,0]
    cdef long* M_i_ptr
    cdef long* M_k_ptr

    # Initialise distances: 0 on the diagonal, 510 where no edge exists.
    for i in range(n):
        for j in range(n):
            if i == j:
                M[i][j] = 0
            elif M[i][j] == 0:
                M[i][j] = 510

    # Floyd-Warshall relaxation: try every node k as an intermediate stop.
    for k in range(n):
        M_k_ptr = M_ptr + n*k
        for i in range(n):
            M_i_ptr = M_ptr + n*i
            M_ik = M_i_ptr[k]
            for j in range(n):
                cost_ikkj = M_ik + M_k_ptr[j]
                M_ij = M_i_ptr[j]
                if M_ij > cost_ikkj:
                    M_i_ptr[j] = cost_ikkj
                    # Remember which intermediate node produced the improvement.
                    path[i][j] = k

    # Clamp anything at or beyond the cap back to 510 and mark the pair unreachable.
    for i in range(n):
        for j in range(n):
            if M[i][j] >= 510:
                path[i][j] = 510
                M[i][j] = 510

    return M, path
def get_all_edges(path, i, j):
    """Recursively list the intermediate nodes between i and j.

    ``path[i][j]`` is read as the intermediate node k recorded for the pair;
    the result is the nodes between i and k, then k, then the nodes between
    k and j. A value of 0 ends the recursion with an empty list.
    """
    # NOTE(review): 0 is both a valid node index and the "no intermediate"
    # marker, so a path through node 0 cannot be represented - confirm intended.
    # NOTE(review): presumably only called for reachable pairs; a stored 510
    # would be used as an index here - verify against callers.
    cdef unsigned int k = path[i][j]
    if k == 0:
        return []
    return get_all_edges(path, i, k) + [k] + get_all_edges(path, k, j)

@ -1,71 +0,0 @@
"""Base DataModule class."""
from pathlib import Path
from typing import Dict
import argparse
import os
import pytorch_lightning as pl
from import DataLoader
class Config(dict):
    """Dictionary whose entries can also be read and written as attributes.

    Reading an attribute that is not a key yields ``None`` instead of raising.
    """

    def __getattr__(self, name):
        # Only reached when regular attribute lookup has already failed.
        try:
            return self[name]
        except KeyError:
            return None

    def __setattr__(self, name, val):
        # Attribute assignment is stored as a dictionary entry.
        self.__setitem__(name, val)
class BaseDataModule(pl.LightningDataModule):
Base DataModule.
Learn more at
def __init__(self, args: argparse.Namespace = None) -> None:
self.args = Config(vars(args)) if args is not None else {}
self.batch_size = self.args.get("batch_size", BATCH_SIZE)
self.num_workers = self.args.get("num_workers", NUM_WORKERS)
def add_to_argparse(parser):
"--batch_size", type=int, default=BATCH_SIZE, help="Number of examples to operate on per forward step."
"--num_workers", type=int, default=0, help="Number of additional processes to load data."
"--dataset", type=str, default="./dataset/NELL", help="Number of additional processes to load data."
return parser
def prepare_data(self):
Use this method to do things that might write to disk or that need to be done only from a single GPU in distributed settings (so don't set state `self.x = y`).
def setup(self, stage=None):
Split into train, val, test, and set dims.
Should assign `torch Dataset` objects to self.data_train, self.data_val, and optionally self.data_test.
self.data_train = None
self.data_val = None
self.data_test = None
def train_dataloader(self):
    """Return a shuffling ``DataLoader`` over ``self.data_train``."""
    loader_options = dict(
        shuffle=True,
        batch_size=self.batch_size,
        num_workers=self.num_workers,
        pin_memory=True,
    )
    return DataLoader(self.data_train, **loader_options)
def val_dataloader(self):
    """Return a non-shuffling ``DataLoader`` over ``self.data_val``."""
    loader_options = dict(
        shuffle=False,
        batch_size=self.batch_size,
        num_workers=self.num_workers,
        pin_memory=True,
    )
    return DataLoader(self.data_val, **loader_options)
def test_dataloader(self):
return DataLoader(self.data_test, shuffle=False, batch_size=self.batch_size, num_workers=self.num_workers, pin_memory=True)

@ -1,195 +0,0 @@
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
from enum import Enum
import torch
from import DataLoader
from transformers import AutoTokenizer, BertTokenizer
# from transformers.configuration_bert import BertTokenizer, BertTokenizerFast
from transformers.tokenization_utils_base import (BatchEncoding,
from .base_data_module import BaseDataModule
from .processor import KGProcessor, get_dataset
import transformers
class ExplicitEnum(Enum):
Enum with more explicit error message for missing values.
def _missing_(cls, value):
raise ValueError(
f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}"
class PaddingStrategy(ExplicitEnum):
Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for tab-completion
in an IDE.
LONGEST = "longest"
MAX_LENGTH = "max_length"
DO_NOT_PAD = "do_not_pad"
import numpy as np
class DataCollatorForSeq2Seq:
Data collator that will dynamically pad the inputs received, as well as the labels.
tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
The tokenizer used for encoding the data.
model (:class:`~transformers.PreTrainedModel`):
The model that is being trained. If set and has the `prepare_decoder_input_ids_from_labels`, use it to
prepare the `decoder_input_ids`
This is useful when using `label_smoothing` to avoid calculating loss twice.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence is provided).
* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
max_length (:obj:`int`, `optional`):
Maximum length of the returned list and optionally padding length (see above).
pad_to_multiple_of (:obj:`int`, `optional`):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
7.5 (Volta).
label_pad_token_id (:obj:`int`, `optional`, defaults to -100):
The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
tokenizer: PreTrainedTokenizerBase
model: Optional[Any] = None
padding: Union[bool, str, PaddingStrategy] = True
max_length: Optional[int] = None
pad_to_multiple_of: Optional[int] = None
label_pad_token_id: int = -100
return_tensors: str = "pt"
num_labels: int = 0
def __call__(self, features, return_tensors=None):
if return_tensors is None:
return_tensors = self.return_tensors
labels = [feature.pop("labels") for feature in features] if "labels" in features[0].keys() else None
label = [feature.pop("label") for feature in features]
features_keys = {}
name_keys = list(features[0].keys())
for k in name_keys:
# ignore the padding arguments
if k in ["input_ids", "attention_mask", "token_type_ids"]: continue
features_keys[k] = [feature.pop(k) for feature in features]
except KeyError:
# We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the
# same length to return tensors.
bsz = len(labels)
with torch.no_grad():
new_labels = torch.zeros(bsz, self.num_labels)
for i,l in enumerate(labels):
if isinstance(l, int):
new_labels[i][l] = 1
for j in l:
new_labels[i][j] = 1
labels = new_labels
features = self.tokenizer.pad(
features['labels'] = labels
features['label'] = torch.tensor(label)
return features
class KGC(BaseDataModule):
def __init__(self, args, model) -> None:
self.tokenizer = AutoTokenizer.from_pretrained(self.args.model_name_or_path, use_fast=False)
self.processor = KGProcessor(self.tokenizer, args)
self.label_list = self.processor.get_labels(args.data_dir)
entity_list = self.processor.get_entities(args.data_dir)
num_added_tokens = self.tokenizer.add_special_tokens({'additional_special_tokens': entity_list})
self.sampler = DataCollatorForSeq2Seq(self.tokenizer,
pad_to_multiple_of=8 if self.args.precision == 16 else None,
num_labels = len(entity_list),
relations_tokens = self.processor.get_relations(args.data_dir)
self.num_relations = len(relations_tokens)
num_added_tokens = self.tokenizer.add_special_tokens({'additional_special_tokens': relations_tokens})
vocab = self.tokenizer.get_added_vocab()
self.relation_id_st = vocab[relations_tokens[0]]
self.relation_id_ed = vocab[relations_tokens[-1]] + 1
self.entity_id_st = vocab[entity_list[0]]
self.entity_id_ed = vocab[entity_list[-1]] + 1
def setup(self, stage=None):
    """Build the train, dev and test datasets through ``get_dataset``.

    The results are stored on ``self.data_train``, ``self.data_val`` and
    ``self.data_test`` in that order; ``stage`` is not used.
    """
    split_by_attribute = (
        ("data_train", "train"),
        ("data_val", "dev"),
        ("data_test", "test"),
    )
    for attribute, split in split_by_attribute:
        dataset = get_dataset(self.args, self.processor, self.label_list, self.tokenizer, split)
        setattr(self, attribute, dataset)
def prepare_data(self):
def get_config(self):
d = {}
for k, v in self.__dict__.items():
if "st" in k or "ed" in k:
return d
def add_to_argparse(parser):
    """Register the data-module command-line options on ``parser``.

    Flag names and defaults are unchanged; only the help texts were corrected,
    since several had been copy-pasted from unrelated options.

    Args:
        parser: An ``argparse.ArgumentParser`` (or argument group) to extend.

    Returns:
        The same ``parser`` object, to allow chaining.
    """
    parser.add_argument("--model_name_or_path", type=str, default="roberta-base",
                        help="Name or path of the pretrained model; also used to load the tokenizer.")
    # NOTE(review): the default looks copy-pasted from --model_name_or_path;
    # kept as-is so existing invocations behave the same.
    parser.add_argument("--data_dir", type=str, default="roberta-base",
                        help="Directory containing the dataset files.")
    parser.add_argument("--max_seq_length", type=int, default=256,
                        help="Maximum sequence length of the tokenized input.")
    parser.add_argument("--warm_up_radio", type=float, default=0.1,
                        help="Warm-up ratio (the flag keeps its historical spelling 'radio').")
    parser.add_argument("--eval_batch_size", type=int, default=8,
                        help="Batch size for the validation and test data loaders.")
    parser.add_argument("--overwrite_cache", action="store_true", default=False,
                        help="Ignore cached features and rebuild them.")
    return parser
def get_tokenizer(self):
    """Return the tokenizer held by this data module."""
    tokenizer = self.tokenizer
    return tokenizer
def train_dataloader(self):
    """Return the training ``DataLoader``, collated by ``self.sampler``."""
    # Shuffling is switched off whenever ``faiss_init`` is truthy.
    shuffle_train = not self.args.faiss_init
    return DataLoader(
        self.data_train,
        batch_size=self.args.batch_size,
        shuffle=shuffle_train,
        collate_fn=self.sampler,
        num_workers=self.num_workers,
        pin_memory=True,
    )
def val_dataloader(self):
    """Return the validation ``DataLoader``, batched by ``args.eval_batch_size``."""
    return DataLoader(
        self.data_val,
        batch_size=self.args.eval_batch_size,
        collate_fn=self.sampler,
        num_workers=self.num_workers,
        pin_memory=True,
    )
def test_dataloader(self):
return DataLoader(self.data_test, num_workers=self.num_workers, pin_memory=True, collate_fn=self.sampler, batch_size=self.args.eval_batch_size)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -1,954 +0,0 @@
from hashlib import new
from re import DEBUG
import contextlib
import sys
from collections import Counter
from multiprocessing import Pool
from torch._C import HOIST_CONV_PACKED_PARAMS
from import Dataset, Sampler, IterableDataset
from collections import defaultdict
from functools import partial
from multiprocessing import Pool
import os
import random
import json
import torch
import copy
import numpy as np
import pickle
from tqdm import tqdm
from dataclasses import dataclass, asdict, replace
import inspect
from import AutoTokenizer
from models.utils import get_entity_spans_pre_processing
import pyximport
pyximport.install(setup_args={'include_dirs': np.get_include()})
import data.algos as algos
def lmap(a, b):
    """Apply the callable ``a`` to every item of the iterable ``b``.

    Returns the results as a list, in iteration order.
    """
    # a is a function, b is a sequence of values; the mapped values come back as a list.
    return [a(item) for item in b]
def cache_results(_cache_fp, _refresh=False, _verbose=1):
    """Build a decorator that pickles the decorated function's result and reuses it.

    The decorated function must be called with the run-arguments object as its
    first positional argument and the split name (``mode``) as its last one.
    The cache file is placed in ``args.data_dir`` and its name is derived from
    ``mode`` and several fields of ``args``; ``args.overwrite_cache`` decides
    whether an existing file is ignored.  The ``_cache_fp`` and ``_refresh``
    values (decorator arguments or per-call keyword overrides) are validated
    but then superseded by those ``args``-derived values.

    Args:
        _cache_fp: Default cache path (superseded, see above).
        _refresh: Default refresh flag (superseded, see above).
        _verbose: ``1`` logs a message when a cache file is read.

    Returns:
        A decorator.  The wrapped function accepts the extra keyword arguments
        ``_cache_fp``, ``_refresh`` and ``_verbose``; they are stripped before
        the original function is called.

    Raises:
        RuntimeError: At decoration time if the function already declares one
            of the three reserved keywords, or at call time if the function
            returns ``None``.
    """
    import logging
    from functools import wraps

    # Same logger object as a module-level ``logging.getLogger(__name__)``.
    log = logging.getLogger(__name__)

    def wrapper_(func):
        for key in inspect.signature(func).parameters:
            if key in ('_cache_fp', '_refresh', '_verbose'):
                raise RuntimeError("The function decorated by cache_results cannot have keyword `{}`.".format(key))

        @wraps(func)
        def wrapper(*args, **kwargs):
            my_args = args[0]
            mode = args[-1]

            # Strip and validate the reserved keywords so they never reach ``func``.
            if '_cache_fp' in kwargs:
                cache_filepath = kwargs.pop('_cache_fp')
                assert isinstance(cache_filepath, str), "_cache_fp can only be str."
            else:
                cache_filepath = _cache_fp
            if '_refresh' in kwargs:
                refresh = kwargs.pop('_refresh')
                assert isinstance(refresh, bool), "_refresh can only be bool."
            else:
                refresh = _refresh
            if '_verbose' in kwargs:
                verbose = kwargs.pop('_verbose')
                assert isinstance(verbose, int), "_verbose can only be integer."
            else:
                verbose = _verbose

            # The run arguments take precedence over whatever was chosen above.
            model_name = my_args.model_name_or_path.split("/")[-1]
            is_pretrain = my_args.pretrain
            cache_filepath = os.path.join(my_args.data_dir, f"cached_{mode}_features{model_name}_pretrain{is_pretrain}_faiss{my_args.faiss_init}_seqlength{my_args.max_seq_length}_{my_args.litmodel_class}.pkl")
            refresh = my_args.overwrite_cache

            # ``is False`` is deliberate: only a real boolean False enables reuse.
            if refresh is False and os.path.exists(cache_filepath):
                # NOTE(review): pickle.load executes arbitrary code from the file;
                # only point data_dir at directories you trust.
                with open(cache_filepath, 'rb') as f:
                    results = pickle.load(f)
                if verbose == 1:
                    log.info("Read cache from {}.".format(cache_filepath))
                return results

            results = func(*args, **kwargs)
            if results is None:
                raise RuntimeError("The return value is None. Delete the decorator.")
            with open(cache_filepath, 'wb') as f:
                pickle.dump(results, f)
            log.info("Save cache to {}.".format(cache_filepath))
            return results

        return wrapper

    return wrapper_
import argparse
import csv
import logging
import os
import random
import sys
import numpy as np
import torch
from import (DataLoader, RandomSampler, SequentialSampler,
from import DistributedSampler
from tqdm import tqdm, trange
# from torch.nn import CrossEntropyLoss, MSELoss
# from scipy.stats import pearsonr, spearmanr
# from sklearn.metrics import matthews_corrcoef, f1_scoreclass
logger = logging.getLogger(__name__)
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, text_c=None, text_d=None, label=None, real_label=None, en=None, en_id=None, rel=None, text_d_id=None, graph_inf=None):
        """Constructs a InputExample.

        Every argument is stored unchanged on the attribute of the same name.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For
                single sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second
                sequence. Only must be specified for sequence pair tasks.
            text_c: (Optional) string. The untokenized text of the third
                sequence. Only must be specified for sequence triple tasks.
            text_d: (Optional) extra sequence; ``solve`` in this file passes a
                list of neighbouring entity/relation names.
            label: (Optional) list of entities.
            real_label: (Optional) id of the single gold entity.
            en: (Optional) the known elements of the triple.
            en_id: (Optional) ids matching ``en``.
            rel: (Optional) relation id.
            text_d_id: (Optional) ids matching ``text_d``.
            graph_inf: (Optional) neighbouring triples of the masked slot.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.text_c = text_c
        self.text_d = text_d
        self.label = label
        self.real_label = real_label
        self.en = en
        self.rel = rel  # rel id
        self.text_d_id = text_d_id
        self.graph_inf = graph_inf
        self.en_id = en_id
@dataclass
class InputFeatures:
    """A single set of features of data.

    Declared as a dataclass because the rest of this file builds instances
    with keyword arguments and converts them with ``dataclasses.asdict``.
    """
    input_ids: torch.Tensor
    attention_mask: torch.Tensor
    labels: torch.Tensor = None
    label: torch.Tensor = None
    # The fields below default to the integer 0, not to a tensor.
    en: torch.Tensor = 0
    rel: torch.Tensor = 0
    pos: torch.Tensor = 0
    graph: torch.Tensor = 0
    distance_attention: torch.Tensor = 0
    # attention_bias: torch.Tensor = 0
class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self, data_dir):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file.

        Args:
            input_file: Path of a UTF-8 encoded file.
            quotechar: Quote character handed to ``csv.reader``.

        Returns:
            A list with one list of cell strings per row.
        """
        with open(input_file, "r", encoding="utf-8") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            return [line for line in reader]
import copy
def solve_get_knowledge_store(line, set_type="train", pretrain=1):
    """Build the two examples used to read entity embeddings out of the LM.

    use the LM to get the entity embedding.
    Transductive: triples + text description
    Inductive: text description

    Reads the module-level lookup tables installed by ``filter_init``.

    Args:
        line: Sequence whose first three items are head, relation and tail.
        set_type: Split name used as the prefix of the example guid.
        pretrain: Accepted for signature parity with ``solve``; not used here.

    Returns:
        A list with two ``InputExample`` objects, the first built from the
        tail description and the second from the head description.
    """
    head, relation, tail = line[0], line[1], line[2]
    head_ent_text = ent2text[head]
    tail_ent_text = ent2text[tail]
    relation_text = rel2text[relation]

    # Heads known to pair with (tail, relation).
    # NOTE(review): both examples use this head-side list as ``label``, even
    # the second one whose real_label is the tail -- confirm this is intended.
    b = head_filter_entities["\t".join([tail, relation])]
    label_ids = lmap(lambda x: ent2id[x], b)

    # The index part of the guid is a constant 0: no per-line counter exists here.
    guid = "%s-%s" % (set_type, 0)
    triple_ids = [ent2id[head], rel2id[relation], ent2id[tail]]

    examples = []
    # use the description of c to predict A
    examples.append(
        InputExample(guid=guid, text_a="[PAD]", text_b=relation_text + "[PAD]", text_c="[PAD]" + " " + tail_ent_text, label=label_ids, real_label=ent2id[head], en=triple_ids, rel=0)
    )
    examples.append(
        InputExample(guid=guid, text_a="[PAD]", text_b=relation_text + "[PAD]", text_c="[PAD]" + " " + head_ent_text, label=list(label_ids), real_label=ent2id[tail], en=list(triple_ids), rel=0)
    )
    return examples
def solve(line, set_type="train", pretrain=1, max_triplet=32):
    """Turn one ``(head, relation, tail)`` line into ``InputExample`` objects.

    Reads the module-level lookup tables installed by ``filter_init``.

    * ``pretrain`` truthy: eleven examples pairing ``[MASK]`` with the head
      entity's text -- ten random windows of at most 64 whitespace tokens and
      one with the full text.
    * otherwise: two examples, one masking the head and one masking the tail.
      Instead of text they carry the names/ids of the nodes found in the
      neighbouring triples of the masked slot, so the sub-graph can be
      appended to the model input later.

    NOTE(review): the copy of this function under review had lost its short
    lines (``else:``, the ``examples.append(`` wrappers, the ``i`` initialiser
    and the bodies of the two neighbour loops).  They are restored here from
    the set arithmetic that follows them -- confirm against upstream.

    Args:
        line: Sequence whose first three items are head, relation and tail.
        set_type: Split name used as the prefix of the example guid.
        pretrain: Selects the entity-description examples when truthy.
        max_triplet: Neighbour lists with at least this many triples are
            randomly sub-sampled down to exactly this many.

    Returns:
        A list of ``InputExample`` objects.
    """
    head, relation, tail = line[0], line[1], line[2]
    # The index part of the guid is a constant 0: no per-line counter exists here.
    guid = "%s-%s" % (set_type, 0)
    examples = []

    if pretrain:
        text_a = ent2text[head]
        text_a_tokens = text_a.split()
        for _ in range(10):
            # randint is inclusive, so the window may start at the very end
            # and yield an empty text.
            st = random.randint(0, len(text_a_tokens))
            examples.append(
                InputExample(guid=guid, text_a="[MASK]", text_b=" ".join(text_a_tokens[st:st + 64]), text_c="", label=ent2id[head], real_label=ent2id[head], en=0, rel=0)
            )
        examples.append(
            InputExample(guid=guid, text_a="[MASK]", text_b=text_a, text_c="", label=ent2id[head], real_label=ent2id[head], en=0, rel=0)
        )
        return examples

    tail_key = "\t".join([head, relation])
    head_key = "\t".join([tail, relation])
    a = tail_filter_entities[tail_key]  # tails known for (head, relation)
    b = head_filter_entities[head_key]  # heads known for (tail, relation)

    def _cap(neighbours):
        # Keep short neighbour lists whole, sample the long ones.
        if len(neighbours) < max_triplet:
            return neighbours
        return random.sample(neighbours, max_triplet)

    def _nodes(graph):
        # Collect the names and the ids of every element of every triple.
        names, ids = set(), set()
        for item in graph:
            names.update((item[0], item[1], item[2]))
            ids.update((ent2id[item[0]], rel2id[item[1]], ent2id[item[2]]))
        return names, ids

    # Tail side first, then head side, so the random draws keep their order.
    masked_tail_graph_list = _cap(masked_tail_neighbor[tail_key])
    masked_head_graph_list = _cap(masked_head_neighbor[head_key])

    # The triple's own elements are represented separately (``en``), so they
    # are removed from the neighbour sets.
    own_names = {head, relation, tail}
    own_ids = {ent2id[head], rel2id[relation], ent2id[tail]}
    masked_head_seq, masked_head_seq_id = _nodes(masked_head_graph_list)
    masked_tail_seq, masked_tail_seq_id = _nodes(masked_tail_graph_list)
    masked_head_seq -= own_names
    masked_head_seq_id -= own_ids
    masked_tail_seq -= own_names
    masked_tail_seq_id -= own_ids

    examples.append(
        InputExample(guid=guid, text_a="[MASK]", text_b="[PAD]", text_c="[PAD]", text_d=list(masked_head_seq), label=lmap(lambda x: ent2id[x], b), real_label=ent2id[head], en=[relation, tail], en_id=[rel2id[relation], ent2id[tail]], rel=rel2id[relation], text_d_id=list(masked_head_seq_id), graph_inf=masked_head_graph_list)
    )
    examples.append(
        InputExample(guid=guid, text_a="[PAD]", text_b="[PAD]", text_c="[MASK]", text_d=list(masked_tail_seq), label=lmap(lambda x: ent2id[x], a), real_label=ent2id[tail], en=[head, relation], en_id=[ent2id[head], rel2id[relation]], rel=rel2id[relation], text_d_id=list(masked_tail_seq_id), graph_inf=masked_tail_graph_list)
    )
    return examples
def filter_init(head, tail, t1,t2, ent2id_, ent2token_, rel2id_, masked_head_neighbor_, masked_tail_neighbor_, rel2token_):
    """Install the lookup tables that ``solve`` and friends read as module globals.

    Args:
        head: Stored as ``head_filter_entities``.
        tail: Stored as ``tail_filter_entities``.
        t1: Stored as ``ent2text``.
        t2: Stored as ``rel2text``.
        ent2id_: Stored as ``ent2id``.
        ent2token_: Stored as ``ent2token``.
        rel2id_: Stored as ``rel2id``.
        masked_head_neighbor_: Stored as ``masked_head_neighbor``.
        masked_tail_neighbor_: Stored as ``masked_tail_neighbor``.
        rel2token_: Stored as ``rel2token``.
    """
    global head_filter_entities, tail_filter_entities
    global ent2text, rel2text
    global ent2id, ent2token, rel2id, rel2token
    global masked_head_neighbor, masked_tail_neighbor

    head_filter_entities, tail_filter_entities = head, tail
    ent2text, rel2text = t1, t2
    ent2id, ent2token = ent2id_, ent2token_
    rel2id, rel2token = rel2id_, rel2token_
    masked_head_neighbor = masked_head_neighbor_
    masked_tail_neighbor = masked_tail_neighbor_
def delete_init(ent2text_):
    """Replace the module-level ``ent2text`` table with ``ent2text_``."""
    global ent2text
    ent2text = ent2text_
# NOTE(review): this class is shown without indentation and several short
# lines (loop bodies, `try:`/`else:`, closing brackets) are absent in this
# copy; the comments below describe only what the remaining lines show.
class KGProcessor(DataProcessor):
"""Processor for knowledge graph data set."""
def __init__(self, tokenizer, args):
self.labels = set()
self.tokenizer = tokenizer
self.args = args
# Entity descriptions come from entity2textlong.txt when that file exists in
# args.data_dir, otherwise from entity2text.txt.
self.entity_path = os.path.join(args.data_dir, "entity2textlong.txt") if os.path.exists(os.path.join(args.data_dir, 'entity2textlong.txt')) \
else os.path.join(args.data_dir, "entity2text.txt")
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train", data_dir, self.args)
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev", data_dir, self.args)
# `chunk` is inserted into the file name: test{chunk}.tsv.
def get_test_examples(self, data_dir, chunk=""):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, f"test{chunk}.tsv")), "test", data_dir, self.args)
def get_relations(self, data_dir):
"""Gets all labels (relations) in the knowledge graph."""
# return list(self.labels)
with open(os.path.join(data_dir, "relations.txt"), 'r') as f:
lines = f.readlines()
relations = []
# NOTE(review): the body of this loop is missing in this copy; presumably it
# fills `relations` from the lines -- confirm against upstream.
for line in lines:
# Returns the tokens "[RELATION_0]", "[RELATION_1]", ... one per relation.
rel2token = {ent : f"[RELATION_{i}]" for i, ent in enumerate(relations)}
return list(rel2token.values())
def get_labels(self, data_dir):
"""Gets all labels (0, 1) for triples in the knowledge graph."""
relation = []
with open(os.path.join(data_dir, "relation2text.txt"), 'r') as f:
lines = f.readlines()
entities = []
# NOTE(review): the body of this loop is missing in this copy.
for line in lines:
return relation
def get_entities(self, data_dir):
"""Gets all entities in the knowledge graph."""
# Reads self.entity_path; the data_dir argument is not used in the visible lines.
with open(self.entity_path, 'r') as f:
lines = f.readlines()
entities = []
# NOTE(review): the body of this loop is missing in this copy.
for line in lines:
# Returns the tokens "[ENTITY_0]", "[ENTITY_1]", ... one per entity.
ent2token = {ent : f"[ENTITY_{i}]" for i, ent in enumerate(entities)}
return list(ent2token.values())
def get_train_triples(self, data_dir):
"""Gets training triples."""
return self._read_tsv(os.path.join(data_dir, "train.tsv"))
def get_dev_triples(self, data_dir):
"""Gets validation triples."""
return self._read_tsv(os.path.join(data_dir, "dev.tsv"))
def get_test_triples(self, data_dir, chunk=""):
"""Gets test triples."""
return self._read_tsv(os.path.join(data_dir, f"test{chunk}.tsv"))
def _create_examples(self, lines, set_type, data_dir, args):
"""Creates examples for the training and dev sets."""
# entity to text
ent2text = {}
ent2text_with_type = {}
with open(self.entity_path, 'r') as f:
ent_lines = f.readlines()
for line in ent_lines:
temp = line.strip().split('\t')
# NOTE(review): an `except IndexError:` follows but its `try:` line is not
# visible in this copy.
end = temp[1]#.find(',')
if "wiki" in data_dir:
assert "Q" in temp[0]
# Literal "\n" sequences become spaces and remaining backslashes are dropped.
ent2text[temp[0]] = temp[1].replace("\\n", " ").replace("\\", "") #[:end]
except IndexError:
# continue
# Entities without a description column get a single space as their text.
end = " "#.find(',')
if "wiki" in data_dir:
assert "Q" in temp[0]
ent2text[temp[0]] = end #[:end]
entities = list(ent2text.keys())
ent2token = {ent : f"[ENTITY_{i}]" for i, ent in enumerate(entities)}
ent2id = {ent : i for i, ent in enumerate(entities)}
rel2text = {}
with open(os.path.join(data_dir, "relation2text.txt"), 'r') as f:
rel_lines = f.readlines()
for line in rel_lines:
temp = line.strip().split('\t')
rel2text[temp[0]] = temp[1]
# relation_names keeps the relations in the order of relations.txt.
relation_names = {}
with open(os.path.join(data_dir, "relations.txt"), "r") as file:
for line in file.readlines():
t = line.strip()
relation_names[t] = rel2text[t]
tmp_lines = []
not_in_text = 0
# Counts triples whose head, tail or relation has no text entry.
# NOTE(review): tmp_lines is never filled in the visible lines, yet it
# replaces `lines` below; the line keeping the valid triples appears to be
# missing in this copy.
for line in tqdm(lines, desc="delete entities without text name."):
if (line[0] not in ent2text) or (line[2] not in ent2text) or (line[1] not in rel2text):
not_in_text += 1
lines = tmp_lines
print(f"total entity not in text : {not_in_text} ")
relations = list(rel2text.keys())
rel2token = {rel : f"[RELATION_{i}]" for i, rel in enumerate(relations)}
# rel id -> relation token id
# Relation ids start right after the entity ids, so the two ranges do not overlap.
num_entities = len(self.get_entities(args.data_dir))
rel2id = {w:i+num_entities for i,w in enumerate(relation_names.keys())}
# Both neighbour files are parsed as JSON despite the .txt extension.
with open(os.path.join(data_dir, "masked_head_neighbor.txt"), 'r') as file:
masked_head_neighbor = json.load(file)
with open(os.path.join(data_dir, "masked_tail_neighbor.txt"), 'r') as file:
masked_tail_neighbor = json.load(file)
examples = []
# head filter head entity
# Keys are "tail\trelation" (head filter) and "head\trelation" (tail filter).
head_filter_entities = defaultdict(list)
tail_filter_entities = defaultdict(list)
dataset_list = ["train.tsv", "dev.tsv", "test.tsv"]
# in training, only use the train triples
if set_type == "train" and not args.pretrain: dataset_list = dataset_list[0:1]
for m in dataset_list:
with open(os.path.join(data_dir, m), 'r') as file:
train_lines = file.readlines()
for idx in range(len(train_lines)):
train_lines[idx] = train_lines[idx].strip().split("\t")
for line in train_lines:
tail_filter_entities["\t".join([line[0], line[1]])].append(line[2])
head_filter_entities["\t".join([line[2], line[1]])].append(line[0])
max_head_entities = max(len(_) for _ in head_filter_entities.values())
max_tail_entities = max(len(_) for _ in tail_filter_entities.values())
# use bce loss, ignore the mlm
# With args.bce the training lines are rebuilt: one triple per filter key,
# using the first entity recorded for that key.
if set_type == "train" and args.bce:
lines = []
for k, v in tail_filter_entities.items():
h, r = k.split('\t')
t = v[0]
lines.append([h, r, t])
for k, v in head_filter_entities.items():
t, r = k.split('\t')
h = v[0]
lines.append([h, r, t])
# for training , select each entity as for get mask embedding.
# In pretrain mode every entity becomes a pseudo-triple (entity, first relation, entity).
if args.pretrain:
rel = list(rel2text.keys())[0]
lines = []
for k in ent2text.keys():
lines.append([k, rel, k])
print(f"max number of filter entities : {max_head_entities} {max_tail_entities}")
# Pass the sub-graph information into filter_init so it is stored in module globals and can be used from solve
from os import cpu_count
# min(1, ...) caps the value at 1; `threads` is not used in the visible lines.
threads = min(1, cpu_count())
# NOTE(review): the closing bracket of this call and the arguments of the two
# partial(...) calls below are missing in this copy.
filter_init(head_filter_entities, tail_filter_entities,ent2text, rel2text, ent2id, ent2token, rel2id, masked_head_neighbor, masked_tail_neighbor, rel2token
if hasattr(args, "faiss_init") and args.faiss_init:
annotate_ = partial(
annotate_ = partial(
examples = list(
map(annotate_, lines),
desc="convert text to examples"
# Each call of annotate_ yields a list of examples; the nested loops walk them.
# NOTE(review): the innermost loop body is missing in this copy.
tmp_examples = []
for e in examples:
for ee in e:
examples = tmp_examples
# delete vars
del head_filter_entities, tail_filter_entities, ent2text, rel2text, ent2id, ent2token, rel2id
return examples
class Verbalizer(object):
    """Turns a (head, relation) pair into a prompt string for a given dataset."""

    # Checked in this order; the first name contained in ``args.data_dir`` wins.
    _KNOWN_MODES = ("WN18RR", "FB15k", "umls", "codexs", "FB13", "WN11")

    def __init__(self, args):
        """Pick the dataset mode from ``args.data_dir``.

        Args:
            args: Object with a ``data_dir`` string attribute.
        """
        # Default to None so that ``_convert`` still works for directories
        # that match none of the known dataset names.
        self.mode = None
        for name in self._KNOWN_MODES:
            if name in args.data_dir:
                self.mode = name
                break

    def _convert(self, head, relation, tail):
        """Return the prompt text for ``head`` and ``relation``.

        Args:
            head: Head entity text.
            relation: Relation text.
            tail: Accepted but not used.

        Returns:
            ``"The {relation} {head} is "`` in ``umls`` mode, otherwise
            ``"{head} {relation}"``.
        """
        if self.mode == "umls":
            return f"The {relation} {head} is "
        return f"{head} {relation}"
class KGCDataset(Dataset):
    """Thin ``Dataset`` wrapper around an indexable collection of features."""

    def __init__(self, features):
        """Store ``features``; it must support ``len()`` and integer indexing."""
        self.features = features

    def __getitem__(self, index):
        """Return the feature stored at ``index``."""
        return self.features[index]

    def __len__(self):
        """Return the number of stored features."""
        return len(self.features)
def convert_examples_to_features_init(tokenizer_for_convert):
    """Store ``tokenizer_for_convert`` as the module-level ``tokenizer``.

    The module-level ``convert_examples_to_features`` reads that global.
    """
    global tokenizer
    tokenizer = tokenizer_for_convert
# Converts one example into a feature dict using the module-level `tokenizer`.
# NOTE(review): the `else:` between the two input_text assignments and the
# arguments of the tokenizer(...) and InputFeatures(...) calls are missing in
# this copy; `max_seq_length` and `mode` are not used in the visible lines.
def convert_examples_to_features(example, max_seq_length, mode, pretrain=1):
"""Loads a data file into a list of `InputBatch`s."""
# Keep at most the first 128 whitespace-separated tokens of each text field.
text_a = " ".join(example.text_a.split()[:128])
text_b = " ".join(example.text_b.split()[:128])
text_c = " ".join(example.text_c.split()[:128])
if pretrain:
input_text_a = text_a
input_text_b = text_b
input_text_a = " ".join([text_a, text_b])
input_text_b = text_c
inputs = tokenizer(
# assert tokenizer.mask_token_id in inputs.input_ids, "mask token must in input"
# The dataclass instance is converted to a plain dict before it is returned.
features = asdict(InputFeatures(input_ids=inputs["input_ids"],
return features
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
if len(tokens_a) > len(tokens_b):
def _truncate_seq_triple(tokens_a, tokens_b, tokens_c, max_length):
"""Truncates a sequence triple in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b) + len(tokens_c)
if total_length <= max_length:
if len(tokens_a) > len(tokens_b) and len(tokens_a) > len(tokens_c):
elif len(tokens_b) > len(tokens_a) and len(tokens_b) > len(tokens_c):
elif len(tokens_c) > len(tokens_a) and len(tokens_c) > len(tokens_b):
# Builds the KGCDataset for one split: fetches the examples from `processor`,
# writes them to examples_{mode}.txt, encodes that file through a worker pool
# and finally rewrites placeholder token ids inside each feature.
# NOTE(review): shown without indentation, and several short lines (`else:`,
# closing brackets, some loop bodies, logging calls) are missing in this copy.
# `label_list` and the incoming `tokenizer` are not used in the visible lines
# before `tokenizer` is re-created below.
def get_dataset(args, processor, label_list, tokenizer, mode):
assert mode in ["train", "dev", "test"], "mode must be in train dev test!"
# use training data to construct the entity embedding
combine_train_and_test = False
# With faiss_init on the test split (and not pretraining) the split is
# switched to "train"; data dirs containing "ind" additionally combine all splits.
if args.faiss_init and mode == "test" and not args.pretrain:
mode = "train"
if "ind" in args.data_dir: combine_train_and_test = True
if mode == "train":
train_examples = processor.get_train_examples(args.data_dir)
elif mode == "dev":
train_examples = processor.get_dev_examples(args.data_dir)
train_examples = processor.get_test_examples(args.data_dir)
if combine_train_and_test:"use all the dataset for getting the entity mask embedding in pretraining pretraining")"use all the dataset for getting the entity mask embedding in pretraining pretraining")
train_examples = processor.get_test_examples(args.data_dir) + processor.get_train_examples(args.data_dir) + processor.get_dev_examples(args.data_dir)
from os import cpu_count
# One JSON object per line is written for every example.
# NOTE(review): `d` is written while still empty in the visible lines; the
# line that fills it from `line` appears to be missing.
with open(os.path.join(args.data_dir, f"examples_{mode}.txt"), 'w') as file:
for line in train_examples:
d = {}
file.write(json.dumps(d) + '\n')
# A fresh from_pretrained should not be needed here; the tokenizer with the added tokens must be reused
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=False)
features = []
file_inputs = [os.path.join(args.data_dir, f"examples_{mode}.txt")]
file_outputs = [os.path.join(args.data_dir, f"features_{mode}.txt")]
# The ExitStack closes every opened input/output file when the block ends;
# "-" selects stdin/stdout instead of a file.
with contextlib.ExitStack() as stack:
inputs = [
stack.enter_context(open(input, "r", encoding="utf-8"))
if input != "-" else sys.stdin
for input in file_inputs
outputs = [
stack.enter_context(open(output, "w", encoding="utf-8"))
if output != "-" else sys.stdout
for output in file_outputs
encoder = MultiprocessingEncoder(tokenizer, args)
# 16 worker processes; each runs encoder.initializer once, and lines are
# handed out in chunks of 1000.
pool = Pool(16, initializer=encoder.initializer)
encoded_lines = pool.imap(encoder.encode_lines, zip(*inputs), 1000)
# encoded_lines = map(encoder.encode_lines, zip(*inputs))
stats = Counter()
# Each result is a (status, encoded lines) pair; statuses are tallied in `stats`.
for i, (filt, enc_lines) in tqdm(enumerate(encoded_lines, start=1), total=len(train_examples)):
if filt == "PASS":
for enc_line, output_h in zip(enc_lines, outputs):
# features.append(enc_line)
# print(enc_line, file=output_h)
stats["num_filtered_" + filt] += 1
for k, v in stats.most_common():
print("[{}] filtered {} lines".format(k, v), file=sys.stderr)
# Post-process every feature dict: en/rel/graph are removed from the dict and
# used to overwrite placeholder positions in input_ids.
for f_id, f in enumerate(features):
en = features[f_id].pop("en")
rel = features[f_id].pop("rel")
graph = features[f_id].pop("graph")
real_label = f['label']
features[f_id]['distance_attention'] = torch.Tensor(features[f_id]['distance_attention'])
cnt = 0
cnt_2 = 0
# A non-list `en` stops the whole post-processing loop.
if not isinstance(en, list): break
pos = 0
for i,t in enumerate(f['input_ids']):
# [PAD] positions take the next id from `en`, shifted past the tokenizer vocabulary.
if t == tokenizer.pad_token_id:
features[f_id]['input_ids'][i] = en[cnt] + len(tokenizer)
cnt += 1
# [UNK] positions take the next id from `graph`, shifted the same way.
if t == tokenizer.unk_token_id:
features[f_id]['input_ids'][i] = graph[cnt_2] + len(tokenizer)
cnt_2 += 1
# Remember the position whose id equals the shifted gold label.
if features[f_id]['input_ids'][i] == real_label + len(tokenizer):
pos = i
if cnt_2 == len(graph) and cnt == len(en): break
# If the token is UNK, take the next graph node from the list and substitute it
assert not (args.faiss_init and pos == 0)
features[f_id]['pos'] = pos
# for i,t in enumerate(f['input_ids']):
# if t == tokenizer.pad_token_id:
# features[f_id]['input_ids'][i] = rel + len(tokenizer) + num_entities
# break
features = KGCDataset(features)
return features
# Encoder whose methods run inside multiprocessing workers; the tokenizer is
# shared with them through the module-level global `bpe`.
# NOTE(review): shown without indentation, and several short lines (docstring
# quotes, `else:`, loop bodies, call arguments, closing brackets) are missing
# in this copy; comments below describe only what the remaining lines show.
class MultiprocessingEncoder(object):
def __init__(self, tokenizer, args):
self.tokenizer = tokenizer
self.pretrain = args.pretrain
self.max_seq_length = args.max_seq_length
# Publishes the tokenizer as the global `bpe`.
def initializer(self):
global bpe
bpe = self.tokenizer
# Returns the token ids of `line` as a list of strings.
def encode(self, line):
global bpe
ids = bpe.encode(line)
return list(map(str, ids))
def decode(self, tokens):
global bpe
return bpe.decode(tokens)
def encode_lines(self, lines):
Encode a set of lines. All lines will be encoded together.
enc_lines = []
# An empty line aborts the whole group with the status "EMPTY".
# NOTE(review): the statements that fill enc_lines are missing in this copy.
for line in lines:
line = line.strip()
if len(line) == 0:
return ["EMPTY", None]
# enc_lines.append(" ".join(tokens))
# enc_lines.append(" ")
# enc_lines.append("123")
return ["PASS", enc_lines]
def decode_lines(self, lines):
dec_lines = []
# NOTE(review): the statement that fills dec_lines is missing in this copy.
for line in lines:
tokens = map(int, line.strip().split())
return ["PASS", dec_lines]
# Builds the feature dict for one example given as a dict (keys text_a..text_d,
# graph_inf, en).
def convert_examples_to_features(self, example):
pretrain = self.pretrain
max_seq_length = self.max_seq_length
global bpe
"""Loads a data file into a list of `InputBatch`s."""
# tokens_a = tokenizer.tokenize(example.text_a)
# tokens_b = tokenizer.tokenize(example.text_b)
# tokens_c = tokenizer.tokenize(example.text_c)
# _truncate_seq_triple(tokens_a, tokens_b, tokens_c, max_length= max_seq_length)
# text_a = " ".join(example['text_a'].split()[:128])
# text_b = " ".join(example['text_b'].split()[:128])
# text_c = " ".join(example['text_c'].split()[:128])
text_a = example['text_a']
text_b = example['text_b']
text_c = example['text_c']
text_d = example['text_d']
graph_list = example['graph_inf']
if pretrain:
# the des of xxx is [MASK] .
input_text = f"The description of {text_a} is that {text_b} ."
inputs = bpe(
# When the head is masked, "MASK" leads the triple; the second group of
# assignments puts "MASK" last.
# NOTE(review): the `else:` lines separating the branches are missing in this copy.
if text_a == "[MASK]":
input_text_a = " ".join([text_a, text_b])
input_text_b = text_c
origin_triplet = ["MASK"] + example['en']
graph_seq = ["MASK"] + example['en'] + text_d
input_text_a = text_a
input_text_b = " ".join([text_b, text_c])
origin_triplet = example['en'] + ["MASK"]
graph_seq = example['en'] + ["MASK"] + text_d
# Add the graph information: append one [UNK] per graph node
input_text_b = " ".join(["[CLS]", input_text_a, input_text_b, bpe.unk_token * len(text_d)])
inputs = bpe(
# assert bpe.mask_token_id in inputs.input_ids, "mask token must in input"
# graph_seq = input_text_b[] read out the graph structure information
# [CLS] [ENTITY_13258] [RELATION_68] [MASK] [ENTITY_4] [RELATION_127] [ENTITY_8] [RELATION_9] [ENTITY_9011] [ENTITY_12477] [PAD] [PAD]
# Obtain the graph structure information
# First, solve adds a temporary variable that stores all sub-graph triples
# Here graph_information = example['graph']
# new_rel holds directed node pairs; each adjacent pair of the triple is
# added in both directions.
new_rel = set()
new_rel.add(tuple((origin_triplet[0], origin_triplet[1])))
new_rel.add(tuple((origin_triplet[1], origin_triplet[0])))
new_rel.add(tuple((origin_triplet[1], origin_triplet[2])))
new_rel.add(tuple((origin_triplet[2], origin_triplet[1])))
# NOTE(review): rel1..rel4 are built but the lines using them are missing in this copy.
for triplet in graph_list:
rel1, rel2, rel3, rel4 = tuple((triplet[0], triplet[1])), tuple((triplet[1], triplet[2])), tuple((triplet[1], triplet[0])), tuple((triplet[2], triplet[1]))
# The triples here are converted into new_rel
# Maps every graph_seq element to its position; being a defaultdict(int),
# unknown elements map to position 0.
KGid2Graphid_map = defaultdict(int)
for i in range(len(graph_seq)):
KGid2Graphid_map[graph_seq[i]] = i
N = len(graph_seq)
# Boolean adjacency matrix over the graph_seq positions.
adj = torch.zeros([N, N], dtype=torch.bool)
for item in list(new_rel):
adj[KGid2Graphid_map[item[0]], KGid2Graphid_map[item[1]]] = True
# presumably all-pairs shortest path lengths from the compiled data.algos
# helper -- TODO confirm
shortest_path_result, _ = algos.floyd_warshall(adj.numpy())
max_dist = np.amax(shortest_path_result)
# Fill in the [PAD] and [CLS] parts; the additionally introduced [SEP] is also treated as [PAD]
# Add an attention_bias with the PAD part set to -inf, processed before being fed to the model and added so the model cannot attend to PAD
# Adding the attention to huggingface's BertForMaskedLM may need further checking
# attention_bias =, N, dtype=torch.float)
# attention_bias[torch.tensor(shortest_path_result == )]
features = asdict(InputFeatures(input_ids=inputs["input_ids"],
distance_attention = shortest_path_result.tolist(),
return features

View File

@ -0,0 +1,136 @@
import torch
import numpy as np
from import Dataset
class TrainDataset(Dataset):
    """Training Dataset class.

    Args:
        triples: The triples used for training the model; a sequence of dicts
            with the keys ``'triple'``, ``'label'`` and ``'sub_samp'``.
        params: Parameters for the experiments; this class reads
            ``train_strategy``, ``num_ent``, ``lbl_smooth`` and ``neg_num``.

    A training Dataset class instance used by DataLoader.
    """

    def __init__(self, triples, params):
        self.triples = triples
        self.p = params
        self.strategy = self.p.train_strategy
        self.entities = np.arange(self.p.num_ent, dtype=np.int32)

    def __len__(self):
        return len(self.triples)

    def __getitem__(self, idx):
        """Return ``(triple, label, neg_ent, sub_samp)`` for item ``idx``.

        The last two entries are ``None`` for the ``'one_to_n'`` strategy.

        Raises:
            NotImplementedError: For any other strategy than ``'one_to_n'``
                and ``'one_to_x'``.
        """
        ele = self.triples[idx]
        triple = torch.LongTensor(ele['triple'])
        label = np.int32(ele['label'])
        sub_samp = np.float32(ele['sub_samp'])
        trp_label = self.get_label(label)

        if self.p.lbl_smooth != 0.0:
            trp_label = (1.0 - self.p.lbl_smooth) * \
                trp_label + (1.0/self.p.num_ent)

        if self.strategy == 'one_to_n':
            return triple, trp_label, None, None
        elif self.strategy == 'one_to_x':
            sub_samp = torch.FloatTensor([sub_samp])
            neg_ent = torch.LongTensor(self.get_neg_ent(triple, label))
            return triple, trp_label, neg_ent, sub_samp
        else:
            raise NotImplementedError

    @staticmethod
    def collate_fn(data):
        """Stack a list of ``__getitem__`` results into batch tensors.

        Returns:
            ``(triple, trp_label, neg_ent, sub_samp)`` when the items carry
            negative samples (one_to_x), otherwise ``(triple, trp_label)``.
        """
        triple = torch.stack([_[0] for _ in data], dim=0)
        trp_label = torch.stack([_[1] for _ in data], dim=0)

        if data[0][2] is not None:  # one_to_x
            neg_ent = torch.stack([_[2] for _ in data], dim=0)
            # Each sub_samp has shape (1,), so concatenation yields (batch,).
            sub_samp = torch.cat([_[3] for _ in data], dim=0)
            return triple, trp_label, neg_ent, sub_samp
        return triple, trp_label

    def get_neg_ent(self, triple, label):
        """Return the positive entity id(s) followed by sampled negative ids.

        Args:
            triple: The triple; its third element is the positive object for
                the ``'one_to_x'`` strategy.
            label: Integer array of all entities that are valid answers; these
                are never drawn as negatives.

        Returns:
            A 1-D integer array: positives first, then negatives drawn without
            replacement.  For ``'one_to_x'`` that is one positive plus
            ``neg_num`` negatives; otherwise all of ``label`` plus
            ``neg_num - len(label)`` negatives.
        """
        # Builtin bool: the ``np.bool`` alias no longer exists in current NumPy.
        mask = np.ones([self.p.num_ent], dtype=bool)
        mask[label] = False

        if self.strategy == 'one_to_x':
            pos_obj = np.asarray(triple[2]).reshape([-1])
            num_neg = self.p.neg_num
        else:
            pos_obj = np.asarray(label).reshape([-1])
            num_neg = self.p.neg_num - len(label)

        neg_ent = np.int32(np.random.choice(
            self.entities[mask], num_neg, replace=False)).reshape([-1])
        return np.concatenate((pos_obj, neg_ent))

    def get_label(self, label):
        """Build the target vector for one item.

        Returns:
            For ``'one_to_n'`` a multi-hot float tensor of length ``num_ent``;
            for ``'one_to_x'`` a float tensor ``[1, 0, ..., 0]`` of length
            ``neg_num + 1`` (the positive comes first).

        Raises:
            NotImplementedError: For any other strategy.
        """
        if self.strategy == 'one_to_n':
            y = np.zeros([self.p.num_ent], dtype=np.float32)
            for e2 in label:
                y[e2] = 1.0
        elif self.strategy == 'one_to_x':
            y = [1] + [0] * self.p.neg_num
        else:
            raise NotImplementedError
        return torch.FloatTensor(y)
class TestDataset(Dataset):
    """Evaluation Dataset class.

    Args:
        triples: The triples used for evaluating the model; a sequence of
            dicts with the keys ``'triple'`` and ``'label'``.
        params: Parameters for the experiments; this class reads ``num_ent``.

    An evaluation Dataset class instance used by DataLoader for model evaluation.
    """

    def __init__(self, triples, params):
        self.triples = triples
        self.p = params

    def __len__(self):
        return len(self.triples)

    def __getitem__(self, idx):
        """Return ``(triple, label)`` where ``label`` is a multi-hot vector."""
        ele = self.triples[idx]
        triple, label = torch.LongTensor(ele['triple']), np.int32(ele['label'])
        label = self.get_label(label)
        return triple, label

    @staticmethod
    def collate_fn(data):
        """Stack a list of ``__getitem__`` results into two batch tensors."""
        triple = torch.stack([_[0] for _ in data], dim=0)
        label = torch.stack([_[1] for _ in data], dim=0)
        return triple, label

    def get_label(self, label):
        """Return a float tensor of length ``num_ent`` with 1.0 at every id in ``label``."""
        y = np.zeros([self.p.num_ent], dtype=np.float32)
        for e2 in label:
            y[e2] = 1.0
        return torch.FloatTensor(y)

@ -1,6 +0,0 @@
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 2

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -1,237 +0,0 @@
/soccer/football_team/current_roster./soccer/football_roster_position/position 0
/music/artist/origin 1
/ice_hockey/hockey_team/current_roster./sports/sports_team_roster/position 2
/food/food/nutrients./food/nutrition_fact/nutrient 3
/film/actor/film./film/performance/film 4
/award/award_nominee/award_nominations./award/award_nomination/nominated_for 5
/government/political_party/politicians_in_this_party./government/political_party_tenure/politician 6
/base/schemastaging/person_extra/net_worth./measurement_unit/dated_money_value/currency 7
/people/deceased_person/place_of_death 8
/people/person/profession 9
/location/administrative_division/first_level_division_of 10
/base/marchmadness/ncaa_basketball_tournament/seeds./base/marchmadness/ncaa_tournament_seed/team 11
/education/university/international_tuition./measurement_unit/dated_money_value/currency 12
/location/us_county/county_seat 13
/location/location/partially_contains 14
/tv/tv_program/program_creator 15
/film/film/music 16
/tv/tv_program/languages 17
/common/topic/webpage./common/webpage/category 18
/user/tsegaran/random/taxonomy_subject/entry./user/tsegaran/random/taxonomy_entry/taxonomy 19
/education/field_of_study/students_majoring./education/education/major_field_of_study 20
/business/business_operation/assets./measurement_unit/dated_money_value/currency 21
/film/film_set_designer/film_sets_designed 22
/dataworld/gardening_hint/split_to 23
/people/person/languages 24
/business/job_title/people_with_this_title./business/employment_tenure/company 25
/location/country/form_of_government 26
/base/schemastaging/organization_extra/phone_number./base/schemastaging/phone_sandbox/service_language 27
/people/person/place_of_birth 28
/sports/sports_team/colors 29
/education/educational_institution/school_type 30
/award/award_category/winners./award/award_honor/award_winner 31
/organization/organization/headquarters./location/mailing_address/citytown 32
/education/educational_degree/people_with_this_degree./education/education/student 33
/government/legislative_session/members./government/government_position_held/legislative_sessions 34
/film/film/distributors./film/film_film_distributor_relationship/film_distribution_medium 35
/education/educational_degree/people_with_this_degree./education/education/major_field_of_study 36
/location/hud_county_place/county 37
/location/administrative_division/country 38
/film/film/film_production_design_by 39
/award/award_winning_work/awards_won./award/award_honor/award 40
/organization/organization/headquarters./location/mailing_address/state_province_region 41
/base/schemastaging/organization_extra/phone_number./base/schemastaging/phone_sandbox/contact_category 42
/tv/tv_program/country_of_origin 43
/olympics/olympic_participating_country/medals_won./olympics/olympic_medal_honor/medal 44
/location/country/second_level_divisions 45
/award/award_ceremony/awards_presented./award/award_honor/honored_for 46
/organization/organization_member/member_of./organization/organization_membership/organization 47
/education/educational_institution/campuses 48
/music/artist/contribution./music/recording_contribution/performance_role 49
/award/ranked_item/appears_in_ranked_lists./award/ranking/list 50
/people/person/religion 51
/travel/travel_destination/climate./travel/travel_destination_monthly_climate/month 52
/film/special_film_performance_type/film_performance_type./film/performance/film 53
/award/award_nominee/award_nominations./award/award_nomination/award 54
/location/statistical_region/religions./location/religion_percentage/religion 55
/sports/sports_league_draft/picks./sports/sports_league_draft_pick/school 56
/film/film/distributors./film/film_film_distributor_relationship/region 57
/government/politician/government_positions_held./government/government_position_held/legislative_sessions 58
/organization/role/leaders./organization/leadership/organization 59
/tv/tv_network/programs./tv/tv_network_duration/program 60
/soccer/football_team/current_roster./sports/sports_team_roster/position 61
/music/instrument/instrumentalists 62
/business/business_operation/operating_income./measurement_unit/dated_money_value/currency 63
/people/cause_of_death/people 64
/film/film/film_art_direction_by 65
/people/person/sibling_s./people/sibling_relationship/sibling 66
/film/film/cinematography 67
/film/actor/dubbing_performances./film/dubbing_performance/language 68
/base/biblioness/bibs_location/state 69
/base/petbreeds/city_with_dogs/top_breeds./base/petbreeds/dog_city_relationship/dog_breed 70
/people/person/gender 71
/education/field_of_study/students_majoring./education/education/student 72
/base/popstra/celebrity/dated./base/popstra/dated/participant 73
/sports/sports_team/roster./american_football/football_roster_position/position 74
/award/award_winner/awards_won./award/award_honor/award_winner 75
/olympics/olympic_participating_country/medals_won./olympics/olympic_medal_honor/olympics 76
/film/director/film 77
/tv/tv_producer/programs_produced./tv/tv_producer_term/program 78
/film/film_distributor/films_distributed./film/film_film_distributor_relationship/film 79
/olympics/olympic_games/sports 80
/music/record_label/artist 81
/education/university/local_tuition./measurement_unit/dated_money_value/currency 82
/film/film/story_by 83
/people/person/spouse_s./people/marriage/spouse 84
/sports/sports_league/teams./sports/sports_league_participation/team 85
/people/profession/specialization_of 86
/base/americancomedy/celebrity_impressionist/celebrities_impersonated 87
/tv/tv_program/genre 88
/award/award_category/nominees./award/award_nomination/nominated_for 89
/language/human_language/countries_spoken_in 90
/organization/organization/headquarters./location/mailing_address/country 91
/location/statistical_region/gdp_real./measurement_unit/adjusted_money_value/adjustment_currency 92
/education/university/fraternities_and_sororities 93
/award/award_nominee/award_nominations./award/award_nomination/award_nominee 94
/military/military_combatant/military_conflicts./military/military_combatant_group/combatants 95
/award/award_nominated_work/award_nominations./award/award_nomination/nominated_for 96
/location/location/time_zones 97
/film/film/dubbing_performances./film/dubbing_performance/actor 98
/film/film_subject/films 99
/education/educational_degree/people_with_this_degree./education/education/institution 100
/education/educational_institution/colors 101
/award/award_category/category_of 102
/tv/tv_personality/tv_regular_appearances./tv/tv_regular_personal_appearance/program 103
/film/film/language 104
/music/group_member/membership./music/group_membership/group 105
/business/business_operation/revenue./measurement_unit/dated_money_value/currency 106
/film/film/film_festivals 107
/film/actor/film./film/performance/special_performance_type 108
/organization/non_profit_organization/registered_with./organization/non_profit_registration/registering_agency 109
/government/politician/government_positions_held./government/government_position_held/jurisdiction_of_office 110
/base/aareas/schema/administrative_area/administrative_parent 111
/award/award_winning_work/awards_won./award/award_honor/award_winner 112
/organization/organization/place_founded 113
/soccer/football_player/current_team./sports/sports_team_roster/team 114
/government/politician/government_positions_held./government/government_position_held/basic_title 115
/music/artist/track_contributions./music/track_contribution/role 116
/base/localfood/seasonal_month/produce_available./base/localfood/produce_availability/seasonal_months 117
/celebrities/celebrity/celebrity_friends./celebrities/friendship/friend 118
/sports/professional_sports_team/draft_picks./sports/sports_league_draft_pick/school 119
/award/hall_of_fame/inductees./award/hall_of_fame_induction/inductee 120
/influence/influence_node/peers./influence/peer_relationship/peers 121
/medicine/disease/risk_factors 122
/broadcast/content/artist 123
/film/film/estimated_budget./measurement_unit/dated_money_value/currency 124
/military/military_conflict/combatants./military/military_combatant_group/combatants 125
/location/capital_of_administrative_division/capital_of./location/administrative_division_capital_relationship/administrative_division 126
/tv/tv_program/regular_cast./tv/regular_tv_appearance/actor 127
/people/deceased_person/place_of_burial 128
/location/location/adjoin_s./location/adjoining_relationship/adjoins 129
/music/group_member/membership./music/group_membership/role 130
/award/award_ceremony/awards_presented./award/award_honor/award_winner 131
/film/film/prequel 132
/film/film/produced_by 133
/tv/tv_program/tv_producer./tv/tv_producer_term/producer_type 134
/sports/sports_position/players./sports/sports_team_roster/team 135
/olympics/olympic_games/participating_countries 136
/music/genre/parent_genre 137
/tv/tv_writer/tv_programs./tv/tv_program_writer_relationship/tv_program 138
/music/genre/artists 139
/film/film/genre 140
/people/person/employment_history./business/employment_tenure/company 141
/education/university/domestic_tuition./measurement_unit/dated_money_value/currency 142
/people/person/nationality 143
/location/country/capital 144
/location/statistical_region/gni_per_capita_in_ppp_dollars./measurement_unit/dated_money_value/currency 145
/base/aareas/schema/administrative_area/capital 146
/business/business_operation/industry 147
/location/hud_foreclosure_area/estimated_number_of_mortgages./measurement_unit/dated_integer/source 148
/film/film/other_crew./film/film_crew_gig/crewmember 149
/base/popstra/location/vacationers./base/popstra/vacation_choice/vacationer 150
/film/film/film_format 151
/medicine/disease/notable_people_with_this_condition 152
/film/film/costume_design_by 153
/government/government_office_category/officeholders./government/government_position_held/jurisdiction_of_office 154
/location/statistical_region/gdp_nominal./measurement_unit/dated_money_value/currency 155
/sports/sports_team/roster./baseball/baseball_roster_position/position 156
/award/award_winning_work/awards_won./award/award_honor/honored_for 157
/olympics/olympic_sport/athletes./olympics/olympic_athlete_affiliation/olympics 158
/celebrities/celebrity/sexual_relationships./celebrities/romantic_relationship/celebrity 159
/people/marriage_union_type/unions_of_this_type./people/marriage/location_of_ceremony 160
/organization/organization/child./organization/organization_relationship/child 161
/organization/organization_founder/organizations_founded 162
/sports/sports_team/sport 163
/people/ethnicity/geographic_distribution 164
/location/statistical_region/places_exported_to./location/imports_and_exports/exported_to 165
/location/country/official_language 166
/film/film/production_companies 167
/user/jg/default_domain/olympic_games/sports 168
/time/event/locations 169
/people/person/spouse_s./people/marriage/type_of_union 170
/government/governmental_body/members./government/government_position_held/legislative_sessions 171
/media_common/netflix_genre/titles 172
/user/alexander/philosophy/philosopher/interests 173
/film/film/runtime./film/film_cut/film_release_region 174
/education/educational_institution/students_graduates./education/education/student 175
/base/eating/practicer_of_diet/diet 176
/tv/non_character_role/tv_regular_personal_appearances./tv/tv_regular_personal_appearance/person 177
/sports/sports_position/players./sports/sports_team_roster/position 178
/sports/professional_sports_team/draft_picks./sports/sports_league_draft_pick/draft 179
/medicine/symptom/symptom_of 180
/film/person_or_entity_appearing_in_film/films./film/personal_film_appearance/type_of_appearance 181
/sports/sports_team_location/teams 182
/american_football/football_team/current_roster./sports/sports_team_roster/position 183
/people/person/places_lived./people/place_lived/location 184
/location/statistical_region/rent50_2./measurement_unit/dated_money_value/currency 185
/film/film/personal_appearances./film/personal_film_appearance/person 186
/music/instrument/family 187
/sports/sports_team/roster./basketball/basketball_roster_position/position 188
/base/schemastaging/organization_extra/phone_number./base/schemastaging/phone_sandbox/service_location 189
/film/film/release_date_s./film/film_regional_release_date/film_release_region 190
/award/award_category/disciplines_or_subjects 191
/base/popstra/celebrity/friendship./base/popstra/friendship/participant 192
/music/performance_role/regular_performances./music/group_membership/group 193
/film/film/edited_by 194
/base/x2010fifaworldcupsouthafrica/world_cup_squad/current_world_cup_squad./base/x2010fifaworldcupsouthafrica/current_world_cup_squad/current_club 195
/base/popstra/celebrity/canoodled./base/popstra/canoodled/participant 196
/film/film/release_date_s./film/film_regional_release_date/film_release_distribution_medium 197
/film/film/other_crew./film/film_crew_gig/film_crew_role 198
/base/popstra/celebrity/breakup./base/popstra/breakup/participant 199
/film/film/country 200
/music/performance_role/regular_performances./music/group_membership/role 201
/sports/sports_team/roster./american_football/football_historical_roster_position/position_s 202
/film/film/release_date_s./film/film_regional_release_date/film_regional_debut_venue 203
/time/event/instance_of_recurring_event 204
/olympics/olympic_participating_country/athletes./olympics/olympic_athlete_affiliation/olympics 205
/organization/endowed_organization/endowment./measurement_unit/dated_money_value/currency 206
/travel/travel_destination/how_to_get_here./travel/transportation/mode_of_transportation 207
/baseball/baseball_team/team_stats./baseball/baseball_team_stats/season 208
/award/award_category/winners./award/award_honor/ceremony 209
/government/legislative_session/members./government/government_position_held/district_represented 210
/influence/influence_node/influenced_by 211
/base/culturalevent/event/entity_involved 212
/people/ethnicity/people 213
/sports/sport/pro_athletes./sports/pro_sports_played/athlete 214
/location/statistical_region/gdp_nominal_per_capita./measurement_unit/dated_money_value/currency 215
/location/hud_county_place/place 216
/base/aareas/schema/administrative_area/administrative_area_type 217
/base/locations/continents/countries_within 218
/sports/sports_position/players./american_football/football_historical_roster_position/position_s 219
/people/person/spouse_s./people/marriage/location_of_ceremony 220
/education/educational_institution/students_graduates./education/education/major_field_of_study 221
/film/film/written_by 222
/olympics/olympic_sport/athletes./olympics/olympic_athlete_affiliation/country 223
/music/performance_role/guest_performances./music/recording_contribution/performance_role 224
/film/film/featured_film_locations 225
/education/educational_institution_campus/educational_institution 226
/sports/pro_athlete/teams./sports/sports_team_roster/team 227
/people/ethnicity/languages_spoken 228
/film/film/executive_produced_by 229
/tv/tv_producer/programs_produced./tv/tv_producer_term/producer_type 230
/location/location/contains 231
/base/biblioness/bibs_location/country 232
/user/ktrueman/default_domain/international_organization/member_states 233
/music/performance_role/track_performances./music/track_contribution/role 234
/olympics/olympic_games/medals_awarded./olympics/olympic_medal_honor/medal 235
/base/saturdaynightlive/snl_cast_member/seasons./base/saturdaynightlive/snl_season_tenure/cast_members 236

@ -1,237 +0,0 @@
/soccer/football_team/current_roster./soccer/football_roster_position/position soccer football team current roster. soccer football roster position position
/music/artist/origin music artist origin
/ice_hockey/hockey_team/current_roster./sports/sports_team_roster/position ice hockey hockey team current roster. sports sports team roster position
/food/food/nutrients./food/nutrition_fact/nutrient food food nutrients. food nutrition fact nutrient
/film/actor/film./film/performance/film film actor film. film performance film
/award/award_nominee/award_nominations./award/award_nomination/nominated_for award award nominee award nominations. award award nomination nominated for
/government/political_party/politicians_in_this_party./government/political_party_tenure/politician government political party politicians in this party. government political party tenure politician
/base/schemastaging/person_extra/net_worth./measurement_unit/dated_money_value/currency base schemastaging person extra net worth. measurement unit dated money value currency
/people/deceased_person/place_of_death people deceased person place of death
/people/person/profession people person profession
/location/administrative_division/first_level_division_of location administrative division first level division of
/base/marchmadness/ncaa_basketball_tournament/seeds./base/marchmadness/ncaa_tournament_seed/team base marchmadness ncaa basketball tournament seeds. base marchmadness ncaa tournament seed team
/education/university/international_tuition./measurement_unit/dated_money_value/currency education university international tuition. measurement unit dated money value currency
/location/us_county/county_seat location us county county seat
/location/location/partially_contains location location partially contains
/tv/tv_program/program_creator tv tv program program creator
/film/film/music film film music
/tv/tv_program/languages tv tv program languages
/common/topic/webpage./common/webpage/category common topic webpage. common webpage category
/user/tsegaran/random/taxonomy_subject/entry./user/tsegaran/random/taxonomy_entry/taxonomy user tsegaran random taxonomy subject entry. user tsegaran random taxonomy entry taxonomy
/education/field_of_study/students_majoring./education/education/major_field_of_study education field of study students majoring. education education major field of study
/business/business_operation/assets./measurement_unit/dated_money_value/currency business business operation assets. measurement unit dated money value currency
/film/film_set_designer/film_sets_designed film film set designer film sets designed
/dataworld/gardening_hint/split_to dataworld gardening hint split to
/people/person/languages people person languages
/business/job_title/people_with_this_title./business/employment_tenure/company business job title people with this title. business employment tenure company
/location/country/form_of_government location country form of government
/base/schemastaging/organization_extra/phone_number./base/schemastaging/phone_sandbox/service_language base schemastaging organization extra phone number. base schemastaging phone sandbox service language
/people/person/place_of_birth people person place of birth
/sports/sports_team/colors sports sports team colors
/education/educational_institution/school_type education educational institution school type
/award/award_category/winners./award/award_honor/award_winner award award category winners. award award honor award winner
/organization/organization/headquarters./location/mailing_address/citytown organization organization headquarters. location mailing address citytown
/education/educational_degree/people_with_this_degree./education/education/student education educational degree people with this degree. education education student
/government/legislative_session/members./government/government_position_held/legislative_sessions government legislative session members. government government position held legislative sessions
/film/film/distributors./film/film_film_distributor_relationship/film_distribution_medium film film distributors. film film film distributor relationship film distribution medium
/education/educational_degree/people_with_this_degree./education/education/major_field_of_study education educational degree people with this degree. education education major field of study
/location/hud_county_place/county location hud county place county
/location/administrative_division/country location administrative division country
/film/film/film_production_design_by film film film production design by
/award/award_winning_work/awards_won./award/award_honor/award award award winning work awards won. award award honor award
/organization/organization/headquarters./location/mailing_address/state_province_region organization organization headquarters. location mailing address state province region
/base/schemastaging/organization_extra/phone_number./base/schemastaging/phone_sandbox/contact_category base schemastaging organization extra phone number. base schemastaging phone sandbox contact category
/tv/tv_program/country_of_origin tv tv program country of origin
/olympics/olympic_participating_country/medals_won./olympics/olympic_medal_honor/medal olympics olympic participating country medals won. olympics olympic medal honor medal
/location/country/second_level_divisions location country second level divisions
/award/award_ceremony/awards_presented./award/award_honor/honored_for award award ceremony awards presented. award award honor honored for
/organization/organization_member/member_of./organization/organization_membership/organization organization organization member member of. organization organization membership organization
/education/educational_institution/campuses education educational institution campuses
/music/artist/contribution./music/recording_contribution/performance_role music artist contribution. music recording contribution performance role
/award/ranked_item/appears_in_ranked_lists./award/ranking/list award ranked item appears in ranked lists. award ranking list
/people/person/religion people person religion
/travel/travel_destination/climate./travel/travel_destination_monthly_climate/month travel travel destination climate. travel travel destination monthly climate month
/film/special_film_performance_type/film_performance_type./film/performance/film film special film performance type film performance type. film performance film
/award/award_nominee/award_nominations./award/award_nomination/award award award nominee award nominations. award award nomination award
/location/statistical_region/religions./location/religion_percentage/religion location statistical region religions. location religion percentage religion
/sports/sports_league_draft/picks./sports/sports_league_draft_pick/school sports sports league draft picks. sports sports league draft pick school
/film/film/distributors./film/film_film_distributor_relationship/region film film distributors. film film film distributor relationship region
/government/politician/government_positions_held./government/government_position_held/legislative_sessions government politician government positions held. government government position held legislative sessions
/organization/role/leaders./organization/leadership/organization organization role leaders. organization leadership organization
/tv/tv_network/programs./tv/tv_network_duration/program tv tv network programs. tv tv network duration program
/soccer/football_team/current_roster./sports/sports_team_roster/position soccer football team current roster. sports sports team roster position
/music/instrument/instrumentalists music instrument instrumentalists
/business/business_operation/operating_income./measurement_unit/dated_money_value/currency business business operation operating income. measurement unit dated money value currency
/people/cause_of_death/people people cause of death people
/film/film/film_art_direction_by film film film art direction by
/people/person/sibling_s./people/sibling_relationship/sibling people person sibling s. people sibling relationship sibling
/film/film/cinematography film film cinematography
/film/actor/dubbing_performances./film/dubbing_performance/language film actor dubbing performances. film dubbing performance language
/base/biblioness/bibs_location/state base biblioness bibs location state
/base/petbreeds/city_with_dogs/top_breeds./base/petbreeds/dog_city_relationship/dog_breed base petbreeds city with dogs top breeds. base petbreeds dog city relationship dog breed
/people/person/gender people person gender
/education/field_of_study/students_majoring./education/education/student education field of study students majoring. education education student
/base/popstra/celebrity/dated./base/popstra/dated/participant base popstra celebrity dated. base popstra dated participant
/sports/sports_team/roster./american_football/football_roster_position/position sports sports team roster. american football football roster position position
/award/award_winner/awards_won./award/award_honor/award_winner award award winner awards won. award award honor award winner
/olympics/olympic_participating_country/medals_won./olympics/olympic_medal_honor/olympics olympics olympic participating country medals won. olympics olympic medal honor olympics
/film/director/film film director film
/tv/tv_producer/programs_produced./tv/tv_producer_term/program tv tv producer programs produced. tv tv producer term program
/film/film_distributor/films_distributed./film/film_film_distributor_relationship/film film film distributor films distributed. film film film distributor relationship film
/olympics/olympic_games/sports olympics olympic games sports
/music/record_label/artist music record label artist
/education/university/local_tuition./measurement_unit/dated_money_value/currency education university local tuition. measurement unit dated money value currency
/film/film/story_by film film story by
/people/person/spouse_s./people/marriage/spouse people person spouse s. people marriage spouse
/sports/sports_league/teams./sports/sports_league_participation/team sports sports league teams. sports sports league participation team
/people/profession/specialization_of people profession specialization of
/base/americancomedy/celebrity_impressionist/celebrities_impersonated base americancomedy celebrity impressionist celebrities impersonated
/tv/tv_program/genre tv tv program genre
/award/award_category/nominees./award/award_nomination/nominated_for award award category nominees. award award nomination nominated for
/language/human_language/countries_spoken_in language human language countries spoken in
/organization/organization/headquarters./location/mailing_address/country organization organization headquarters. location mailing address country
/location/statistical_region/gdp_real./measurement_unit/adjusted_money_value/adjustment_currency location statistical region gdp real. measurement unit adjusted money value adjustment currency
/education/university/fraternities_and_sororities education university fraternities and sororities
/award/award_nominee/award_nominations./award/award_nomination/award_nominee award award nominee award nominations. award award nomination award nominee
/military/military_combatant/military_conflicts./military/military_combatant_group/combatants military military combatant military conflicts. military military combatant group combatants
/award/award_nominated_work/award_nominations./award/award_nomination/nominated_for award award nominated work award nominations. award award nomination nominated for
/location/location/time_zones location location time zones
/film/film/dubbing_performances./film/dubbing_performance/actor film film dubbing performances. film dubbing performance actor
/film/film_subject/films film film subject films
/education/educational_degree/people_with_this_degree./education/education/institution education educational degree people with this degree. education education institution
/education/educational_institution/colors education educational institution colors
/award/award_category/category_of award award category category of
/tv/tv_personality/tv_regular_appearances./tv/tv_regular_personal_appearance/program tv tv personality tv regular appearances. tv tv regular personal appearance program
/film/film/language film film language
/music/group_member/membership./music/group_membership/group music group member membership. music group membership group
/business/business_operation/revenue./measurement_unit/dated_money_value/currency business business operation revenue. measurement unit dated money value currency
/film/film/film_festivals film film film festivals
/film/actor/film./film/performance/special_performance_type film actor film. film performance special performance type
/organization/non_profit_organization/registered_with./organization/non_profit_registration/registering_agency organization non profit organization registered with. organization non profit registration registering agency
/government/politician/government_positions_held./government/government_position_held/jurisdiction_of_office government politician government positions held. government government position held jurisdiction of office
/base/aareas/schema/administrative_area/administrative_parent base aareas schema administrative area administrative parent
/award/award_winning_work/awards_won./award/award_honor/award_winner award award winning work awards won. award award honor award winner
/organization/organization/place_founded organization organization place founded
/soccer/football_player/current_team./sports/sports_team_roster/team soccer football player current team. sports sports team roster team
/government/politician/government_positions_held./government/government_position_held/basic_title government politician government positions held. government government position held basic title
/music/artist/track_contributions./music/track_contribution/role music artist track contributions. music track contribution role
/base/localfood/seasonal_month/produce_available./base/localfood/produce_availability/seasonal_months base localfood seasonal month produce available. base localfood produce availability seasonal months
/celebrities/celebrity/celebrity_friends./celebrities/friendship/friend celebrities celebrity celebrity friends. celebrities friendship friend
/sports/professional_sports_team/draft_picks./sports/sports_league_draft_pick/school sports professional sports team draft picks. sports sports league draft pick school
/award/hall_of_fame/inductees./award/hall_of_fame_induction/inductee award hall of fame inductees. award hall of fame induction inductee
/influence/influence_node/peers./influence/peer_relationship/peers influence influence node peers. influence peer relationship peers
/medicine/disease/risk_factors medicine disease risk factors
/broadcast/content/artist broadcast content artist
/film/film/estimated_budget./measurement_unit/dated_money_value/currency film film estimated budget. measurement unit dated money value currency
/military/military_conflict/combatants./military/military_combatant_group/combatants military military conflict combatants. military military combatant group combatants
/location/capital_of_administrative_division/capital_of./location/administrative_division_capital_relationship/administrative_division location capital of administrative division capital of. location administrative division capital relationship administrative division
/tv/tv_program/regular_cast./tv/regular_tv_appearance/actor tv tv program regular cast. tv regular tv appearance actor
/people/deceased_person/place_of_burial people deceased person place of burial
/location/location/adjoin_s./location/adjoining_relationship/adjoins location location adjoin s. location adjoining relationship adjoins
/music/group_member/membership./music/group_membership/role music group member membership. music group membership role
/award/award_ceremony/awards_presented./award/award_honor/award_winner award award ceremony awards presented. award award honor award winner
/film/film/prequel film film prequel
/film/film/produced_by film film produced by
/tv/tv_program/tv_producer./tv/tv_producer_term/producer_type tv tv program tv producer. tv tv producer term producer type
/sports/sports_position/players./sports/sports_team_roster/team sports sports position players. sports sports team roster team
/olympics/olympic_games/participating_countries olympics olympic games participating countries
/music/genre/parent_genre music genre parent genre
/tv/tv_writer/tv_programs./tv/tv_program_writer_relationship/tv_program tv tv writer tv programs. tv tv program writer relationship tv program
/music/genre/artists music genre artists
/film/film/genre film film genre
/people/person/employment_history./business/employment_tenure/company people person employment history. business employment tenure company
/education/university/domestic_tuition./measurement_unit/dated_money_value/currency education university domestic tuition. measurement unit dated money value currency
/people/person/nationality people person nationality
/location/country/capital location country capital
/location/statistical_region/gni_per_capita_in_ppp_dollars./measurement_unit/dated_money_value/currency location statistical region gni per capita in ppp dollars. measurement unit dated money value currency
/base/aareas/schema/administrative_area/capital base aareas schema administrative area capital
/business/business_operation/industry business business operation industry
/location/hud_foreclosure_area/estimated_number_of_mortgages./measurement_unit/dated_integer/source location hud foreclosure area estimated number of mortgages. measurement unit dated integer source
/film/film/other_crew./film/film_crew_gig/crewmember film film other crew. film film crew gig crewmember
/base/popstra/location/vacationers./base/popstra/vacation_choice/vacationer base popstra location vacationers. base popstra vacation choice vacationer
/film/film/film_format film film film format
/medicine/disease/notable_people_with_this_condition medicine disease notable people with this condition
/film/film/costume_design_by film film costume design by
/government/government_office_category/officeholders./government/government_position_held/jurisdiction_of_office government government office category officeholders. government government position held jurisdiction of office
/location/statistical_region/gdp_nominal./measurement_unit/dated_money_value/currency location statistical region gdp nominal. measurement unit dated money value currency
/sports/sports_team/roster./baseball/baseball_roster_position/position sports sports team roster. baseball baseball roster position position
/award/award_winning_work/awards_won./award/award_honor/honored_for award award winning work awards won. award award honor honored for
/olympics/olympic_sport/athletes./olympics/olympic_athlete_affiliation/olympics olympics olympic sport athletes. olympics olympic athlete affiliation olympics
/celebrities/celebrity/sexual_relationships./celebrities/romantic_relationship/celebrity celebrities celebrity sexual relationships. celebrities romantic relationship celebrity
/people/marriage_union_type/unions_of_this_type./people/marriage/location_of_ceremony people marriage union type unions of this type. people marriage location of ceremony
/organization/organization/child./organization/organization_relationship/child organization organization child. organization organization relationship child
/organization/organization_founder/organizations_founded organization organization founder organizations founded
/sports/sports_team/sport sports sports team sport
/people/ethnicity/geographic_distribution people ethnicity geographic distribution
/location/statistical_region/places_exported_to./location/imports_and_exports/exported_to location statistical region places exported to. location imports and exports exported to
/location/country/official_language location country official language
/film/film/production_companies film film production companies
/user/jg/default_domain/olympic_games/sports user jg default domain olympic games sports
/time/event/locations time event locations
/people/person/spouse_s./people/marriage/type_of_union people person spouse s. people marriage type of union
/government/governmental_body/members./government/government_position_held/legislative_sessions government governmental body members. government government position held legislative sessions
/media_common/netflix_genre/titles media common netflix genre titles
/user/alexander/philosophy/philosopher/interests user alexander philosophy philosopher interests
/film/film/runtime./film/film_cut/film_release_region film film runtime. film film cut film release region
/education/educational_institution/students_graduates./education/education/student education educational institution students graduates. education education student
/base/eating/practicer_of_diet/diet base eating practicer of diet diet
/tv/non_character_role/tv_regular_personal_appearances./tv/tv_regular_personal_appearance/person tv non character role tv regular personal appearances. tv tv regular personal appearance person
/sports/sports_position/players./sports/sports_team_roster/position sports sports position players. sports sports team roster position
/sports/professional_sports_team/draft_picks./sports/sports_league_draft_pick/draft sports professional sports team draft picks. sports sports league draft pick draft
/medicine/symptom/symptom_of medicine symptom symptom of
/film/person_or_entity_appearing_in_film/films./film/personal_film_appearance/type_of_appearance film person or entity appearing in film films. film personal film appearance type of appearance
/sports/sports_team_location/teams sports sports team location teams
/american_football/football_team/current_roster./sports/sports_team_roster/position american football football team current roster. sports sports team roster position
/people/person/places_lived./people/place_lived/location people person places lived. people place lived location
/location/statistical_region/rent50_2./measurement_unit/dated_money_value/currency location statistical region rent50 2. measurement unit dated money value currency
/film/film/personal_appearances./film/personal_film_appearance/person film film personal appearances. film personal film appearance person
/music/instrument/family music instrument family
/sports/sports_team/roster./basketball/basketball_roster_position/position sports sports team roster. basketball basketball roster position position
/base/schemastaging/organization_extra/phone_number./base/schemastaging/phone_sandbox/service_location base schemastaging organization extra phone number. base schemastaging phone sandbox service location
/film/film/release_date_s./film/film_regional_release_date/film_release_region film film release date s. film film regional release date film release region
/award/award_category/disciplines_or_subjects award award category disciplines or subjects
/base/popstra/celebrity/friendship./base/popstra/friendship/participant base popstra celebrity friendship. base popstra friendship participant
/music/performance_role/regular_performances./music/group_membership/group music performance role regular performances. music group membership group
/film/film/edited_by film film edited by
/base/x2010fifaworldcupsouthafrica/world_cup_squad/current_world_cup_squad./base/x2010fifaworldcupsouthafrica/current_world_cup_squad/current_club base x2010fifaworldcupsouthafrica world cup squad current world cup squad. base x2010fifaworldcupsouthafrica current world cup squad current club
/base/popstra/celebrity/canoodled./base/popstra/canoodled/participant base popstra celebrity canoodled. base popstra canoodled participant
/film/film/release_date_s./film/film_regional_release_date/film_release_distribution_medium film film release date s. film film regional release date film release distribution medium
/film/film/other_crew./film/film_crew_gig/film_crew_role film film other crew. film film crew gig film crew role
/base/popstra/celebrity/breakup./base/popstra/breakup/participant base popstra celebrity breakup. base popstra breakup participant
/film/film/country film film country
/music/performance_role/regular_performances./music/group_membership/role music performance role regular performances. music group membership role
/sports/sports_team/roster./american_football/football_historical_roster_position/position_s sports sports team roster. american football football historical roster position position s
/film/film/release_date_s./film/film_regional_release_date/film_regional_debut_venue film film release date s. film film regional release date film regional debut venue
/time/event/instance_of_recurring_event time event instance of recurring event
/olympics/olympic_participating_country/athletes./olympics/olympic_athlete_affiliation/olympics olympics olympic participating country athletes. olympics olympic athlete affiliation olympics
/organization/endowed_organization/endowment./measurement_unit/dated_money_value/currency organization endowed organization endowment. measurement unit dated money value currency
/travel/travel_destination/how_to_get_here./travel/transportation/mode_of_transportation travel travel destination how to get here. travel transportation mode of transportation
/baseball/baseball_team/team_stats./baseball/baseball_team_stats/season baseball baseball team team stats. baseball baseball team stats season
/award/award_category/winners./award/award_honor/ceremony award award category winners. award award honor ceremony
/government/legislative_session/members./government/government_position_held/district_represented government legislative session members. government government position held district represented
/influence/influence_node/influenced_by influence influence node influenced by
/base/culturalevent/event/entity_involved base culturalevent event entity involved
/people/ethnicity/people people ethnicity people
/sports/sport/pro_athletes./sports/pro_sports_played/athlete sports sport pro athletes. sports pro sports played athlete
/location/statistical_region/gdp_nominal_per_capita./measurement_unit/dated_money_value/currency location statistical region gdp nominal per capita. measurement unit dated money value currency
/location/hud_county_place/place location hud county place place
/base/aareas/schema/administrative_area/administrative_area_type base aareas schema administrative area administrative area type
/base/locations/continents/countries_within base locations continents countries within
/sports/sports_position/players./american_football/football_historical_roster_position/position_s sports sports position players. american football football historical roster position position s
/people/person/spouse_s./people/marriage/location_of_ceremony people person spouse s. people marriage location of ceremony
/education/educational_institution/students_graduates./education/education/major_field_of_study education educational institution students graduates. education education major field of study
/film/film/written_by film film written by
/olympics/olympic_sport/athletes./olympics/olympic_athlete_affiliation/country olympics olympic sport athletes. olympics olympic athlete affiliation country
/music/performance_role/guest_performances./music/recording_contribution/performance_role music performance role guest performances. music recording contribution performance role
/film/film/featured_film_locations film film featured film locations
/education/educational_institution_campus/educational_institution education educational institution campus educational institution
/sports/pro_athlete/teams./sports/sports_team_roster/team sports pro athlete teams. sports sports team roster team
/people/ethnicity/languages_spoken people ethnicity languages spoken
/film/film/executive_produced_by film film executive produced by
/tv/tv_producer/programs_produced./tv/tv_producer_term/producer_type tv tv producer programs produced. tv tv producer term producer type
/location/location/contains location location contains
/base/biblioness/bibs_location/country base biblioness bibs location country
/user/ktrueman/default_domain/international_organization/member_states user ktrueman default domain international organization member states
/music/performance_role/track_performances./music/track_contribution/role music performance role track performances. music track contribution role
/olympics/olympic_games/medals_awarded./olympics/olympic_medal_honor/medal olympics olympic games medals awarded. olympics olympic medal honor medal
/base/saturdaynightlive/snl_cast_member/seasons./base/saturdaynightlive/snl_season_tenure/cast_members base saturdaynightlive snl cast member seasons. base saturdaynightlive snl season tenure cast members

@ -1,237 +0,0 @@

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -1,155 +0,0 @@
"cells": [
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"path1 = './entities.txt'\n",
"path2 = './relations.txt'\n",
"path3 = './train.tsv'\n",
"path4 = './dev.tsv'\n",
"path5 = './test.tsv'\n",
"path6 = './get_neighbor/entity2id.txt'\n",
"path7 = './get_neighbor/relation2id.txt'\n",
"path8 = './get_neighbor/train2id.txt'\n",
"path9 = './get_neighbor/valid2id.txt'\n",
"path10 = './get_neighbor/test2id.txt'"
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"with open(path1, 'r') as f:\n",
" a = f.readlines()\n",
"cnt = 0\n",
"with open(path6, 'w') as f:\n",
" for line in a:\n",
" en = line.strip()\n",
" f.write(en + '\\t' + str(cnt) + '\\n')\n",
" cnt += 1\n",
" "
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"with open(path2, 'r') as f:\n",
" a = f.readlines()\n",
"cnt = 0\n",
"with open(path7, 'w') as f:\n",
" for line in a:\n",
" re = line.strip()\n",
" f.write(re + '\\t' + str(cnt) + '\\n')\n",
" cnt += 1\n",
" "
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"with open(path6, 'r') as f:\n",
" a = f.readlines()\n",
"en2id = {}\n",
"for line in a:\n",
" b = line.strip().split('\\t')\n",
" en, num = b[0], b[1]\n",
" en2id[en] = num"
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"with open(path7, 'r') as f:\n",
" a = f.readlines()\n",
"re2id = {}\n",
"for line in a:\n",
" b = line.strip().split('\\t')\n",
" re, num = b[0], b[1]\n",
" re2id[re] = num"
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"with open(path3, 'r') as f:\n",
" a = f.readlines()\n",
"with open(path8, 'w') as f:\n",
" for line in a:\n",
" b = line.strip().split('\\t')\n",
" h, r, t = b[0], b[1], b[2]\n",
" f.write(en2id[h] + ' ' + re2id[r] + ' ' + en2id[t] + '\\n')\n",
" "
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"with open(path4, 'r') as f:\n",
" a = f.readlines()\n",
"with open(path9, 'w') as f:\n",
" for line in a:\n",
" b = line.strip().split('\\t')\n",
" h, r, t = b[0], b[1], b[2]\n",
" f.write(en2id[h] + ' ' + re2id[r] + ' ' + en2id[t] + '\\n')\n",
" "
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open(path5, 'r') as f:\n",
" a = f.readlines()\n",
"with open(path10, 'w') as f:\n",
" for line in a:\n",
" b = line.strip().split('\\t')\n",
" h, r, t = b[0], b[1], b[2]\n",
" f.write(en2id[h] + ' ' + re2id[r] + ' ' + en2id[t] + '\\n')\n",
" "
"metadata": {
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
"nbformat": 4,
"nbformat_minor": 2

@ -1,11 +0,0 @@
_member_of_domain_usage 0
_has_part 1
_also_see 2
_hypernym 3
_synset_domain_topic_of 4
_derivationally_related_form 5
_similar_to 6
_instance_hypernym 7
_verb_group 8
_member_meronym 9
_member_of_domain_region 10

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -1,11 +0,0 @@
_member_of_domain_usage member of domain usage
_has_part has part
_also_see also see
_hypernym hypernym
_synset_domain_topic_of synset domain topic of
_derivationally_related_form derivationally related form
_similar_to similar to
_instance_hypernym instance hypernym
_verb_group verb group
_member_meronym member meronym
_member_of_domain_region member of domain region

View File

@ -1,11 +0,0 @@

@ -1,151 +0,0 @@
from collections import defaultdict
import time
import argparse
# Command-line interface: --dataset is the dataset directory, resolved relative
# to the current working directory (for example FB15k-237).  It is mandatory
# because every input path below is built from it.
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type=str, default=None, required=True)
args = parser.parse_args()

# Directory holding the id-mapped files read below.
neighbor_dir = './' + args.dataset + '/get_neighbor/'

# id -> name lookup tables.  Each input line is "<name>\t<id>".
# defaultdict(str) makes an unknown id resolve to the empty string.
id2entity_name = defaultdict(str)
with open(neighbor_dir + 'entity2id.txt', 'r') as file:
    for line in file:
        if not line.strip():
            continue  # tolerate blank lines instead of failing on unpacking
        _name, _id = line.strip().split("\t")
        id2entity_name[int(_id)] = _name

id2relation_name = defaultdict(str)
with open(neighbor_dir + 'relation2id.txt', 'r') as file:
    for line in file:
        if not line.strip():
            continue
        _name, _id = line.strip().split("\t")
        id2relation_name[int(_id)] = _name

# All triples, as mutable [head_id, relation_id, tail_id] lists, read in the
# order train, test, valid.  Each input line is whitespace-separated
# "<head> <relation> <tail>".
# NOTE(review): test and valid triples are appended to the same list as the
# training triples -- confirm that this is intended for neighbor extraction.
train_triplet = []
for split_file in ('train2id.txt', 'test2id.txt', 'valid2id.txt'):
    # "with" closes each file; iterating a bare open(...) left it open.
    with open(neighbor_dir + split_file, 'r') as file:
        for line in file:
            fields = line.strip('\n').split()
            if not fields:
                continue
            head, relation, tail = fields
            train_triplet.append([int(head), int(relation), int(tail)])
# Adjacency maps filled in place by init_graph():
#   graph[head][tail] = relation           (edges leaving `head`)
#   reverse_graph[tail][head] = relation   (edges arriving at `tail`)
graph = {}
reverse_graph = {}


def init_graph(graph_triplet):
    """Add every triple in `graph_triplet` to `graph` and `reverse_graph`.

    Args:
        graph_triplet: iterable of indexable triples laid out as
            (head, relation, tail).

    Returns:
        None.  The module-level `graph` and `reverse_graph` dicts are
        updated in place.

    Only one relation is stored per (head, tail) pair: when the same pair
    occurs with several relations, the last one seen overwrites the others.
    """
    for triple in graph_triplet:
        head, rela, tail = triple[0], triple[1], triple[2]
        # setdefault creates the inner dict on first sight of a node, which
        # replaces the if/else whose two branches performed the same store.
        graph.setdefault(head, {})[tail] = rela
        reverse_graph.setdefault(tail, {})[head] = rela
import random
def random_delete(triplet, reserved_num):
    """Keep `reserved_num` randomly chosen elements of `triplet`.

    The choice is made without replacement by `random.sample`, which raises
    ValueError when `reserved_num` exceeds the number of elements.

    Returns:
        A new list with the retained elements.
    """
    return random.sample(triplet, reserved_num)
def get_onestep_neighbors(graph, source, sample_num):
    """List every edge stored under `source` in `graph`.

    Args:
        graph: adjacency mapping of the form graph[node][neighbor] = relation.
        source: node whose direct neighbors are wanted.
        sample_num: accepted for interface compatibility but not used; the
            sampling step it belonged to is disabled, so all neighbors are
            returned.

    Returns:
        A list of (source, relation, neighbor) tuples in the mapping's
        iteration order, or an empty list when `source` is not in `graph`.
    """
    # .get() with an empty default covers the unknown-node case that was
    # previously handled by catching KeyError.  The former ValueError handler
    # is dropped: nothing in this function can raise ValueError.
    return [(source, rela, nei) for nei, rela in graph.get(source, {}).items()]
def get_entity_neighbors(traget_entity, max_triplet):
    """Gather the one-step neighbors of an entity in both edge directions.

    Args:
        traget_entity: id of the entity to look up.
        max_triplet: half of this value (integer division) is forwarded to
            get_onestep_neighbors as its `sample_num` argument.

    Returns:
        The lookups from `graph` followed by the lookups from
        `reverse_graph`, concatenated into one list.

    NOTE(review): both halves are passed through as get_onestep_neighbors
    returns them, so entries coming from `reverse_graph` also start with
    `traget_entity` -- confirm downstream code expects that orientation.
    """
    per_direction = max_triplet // 2
    return (
        get_onestep_neighbors(graph, traget_entity, per_direction)
        + get_onestep_neighbors(reverse_graph, traget_entity, per_direction)
    )
def get_triplet(triplet):
    """Collect the distinct neighbor triples around both ends of `triplet`.

    Args:
        triplet: indexable (head, relation, tail) triple.

    Returns:
        A list of the unique triples returned by get_entity_neighbors for the
        head and for the tail, with `triplet` itself removed.  No cap on the
        number of returned triples is applied.
    """
    query = (triplet[0], triplet[1], triplet[2])
    around_head = get_entity_neighbors(query[0], 4)
    around_tail = get_entity_neighbors(query[2], 4)
    # De-duplicate first, then drop the query triple from the result.
    unique_triples = list(set(around_head + around_tail))
    return list(set(unique_triples) - {query})
import copy
def change_(triplet_list):
    """Translate id triples into name triples.

    Args:
        triplet_list: iterable of indexable (head_id, relation_id, tail_id)
            triples.

    Returns:
        A list of [head_name, relation_name, tail_name] lists, in input
        order, resolved through `id2entity_name` and `id2relation_name`.
    """
    return [
        [
            id2entity_name[item[0]],
            id2relation_name[item[1]],
            id2entity_name[item[2]],
        ]
        for item in triplet_list
    ]
# Fill graph / reverse_graph before any neighbor lookup.  Without this call
# both maps stay empty and every neighbor list below comes out empty.
# Repeating the call is harmless: it stores the same values again.
init_graph(train_triplet)

# Placeholder entity id substituted for the masked end of a triple.
# NOTE(review): presumably chosen to be larger than any real entity id, so
# that it never matches a node in the graph -- confirm.
mask_idx = 99999999

# "<entity name>\t<relation name>" -> list of [head, relation, tail] name
# triples found around the unmasked entity.
masked_tail_neighbor = defaultdict(list)
masked_head_neighbor = defaultdict(list)

for triplet in train_triplet:
    head_id, relation_id, tail_id = triplet[0], triplet[1], triplet[2]
    # Fresh lists stand in for deep copies; the stored triplet is untouched.
    tail_masked = [head_id, relation_id, mask_idx]
    head_masked = [mask_idx, relation_id, tail_id]
    relation_name = id2relation_name[relation_id]
    tail_key = '\t'.join([id2entity_name[head_id], relation_name])
    head_key = '\t'.join([id2entity_name[tail_id], relation_name])
    # Plain assignment: a repeated key keeps only the latest list.
    masked_tail_neighbor[tail_key] = change_(get_triplet(tail_masked))
    masked_head_neighbor[head_key] = change_(get_triplet(head_masked))
import json
# Write both neighbor tables as JSON (indent=1) into the dataset directory,
# tail-masked table first.
for out_name, neighbor_table in (
    ("/masked_tail_neighbor.txt", masked_tail_neighbor),
    ("/masked_head_neighbor.txt", masked_head_neighbor),
):
    with open("./" + args.dataset + out_name, "w") as file:
        file.write(json.dumps(neighbor_table, indent=1))

@ -1,135 +0,0 @@

@ -1,135 +0,0 @@
idea_or_concept idea or concept
virus virus
spatial_concept spatial concept
human_caused_phenomenon_or_process human caused phenomenon or process
human human
organ_or_tissue_function organ or tissue function
daily_or_recreational_activity daily or recreational activity
steroid steroid
biomedical_or_dental_material biomedical or dental material
vertebrate vertebrate
immunologic_factor immunologic factor
inorganic_chemical inorganic chemical
invertebrate invertebrate
embryonic_structure embryonic structure
functional_concept functional concept
amino_acid_peptide_or_protein amino acid peptide or protein
fish fish
reptile reptile
physical_object physical object
disease_or_syndrome disease or syndrome
biologically_active_substance biologically active substance
physiologic_function physiologic function
population_group population group
group group
body_space_or_junction body space or junction
bird bird
qualitative_concept qualitative concept
bacterium bacterium
cell_function cell function
enzyme enzyme
organophosphorus_compound organophosphorus compound
nucleic_acid_nucleoside_or_nucleotide nucleic acid nucleoside or nucleotide
cell cell
language language
antibiotic antibiotic
indicator_reagent_or_diagnostic_aid indicator reagent or diagnostic aid
fungus fungus
chemical_viewed_functionally chemical viewed functionally
rickettsia_or_chlamydia rickettsia or chlamydia
patient_or_disabled_group patient or disabled group
professional_society professional society
health_care_related_organization health care related organization
clinical_attribute clinical attribute
biomedical_occupation_or_discipline biomedical occupation or discipline
temporal_concept temporal concept
phenomenon_or_process phenomenon or process
family_group family group
chemical_viewed_structurally chemical viewed structurally
regulation_or_law regulation or law
acquired_abnormality acquired abnormality
experimental_model_of_disease experimental model of disease
professional_or_occupational_group professional or occupational group
injury_or_poisoning injury or poisoning
receptor receptor
drug_delivery_device drug delivery device
hazardous_or_poisonous_substance hazardous or poisonous substance
organism organism
neoplastic_process neoplastic process
mammal mammal
molecular_function molecular function
lipid lipid
group_attribute group attribute
nucleotide_sequence nucleotide sequence
biologic_function biologic function
chemical chemical
cell_component cell component
intellectual_product intellectual product
manufactured_object manufactured object
classification classification
geographic_area geographic area
vitamin vitamin
gene_or_genome gene or genome
self_help_or_relief_organization self help or relief organization
pathologic_function pathologic function
amphibian amphibian
laboratory_or_test_result laboratory or test result
organism_attribute organism attribute
cell_or_molecular_dysfunction cell or molecular dysfunction
therapeutic_or_preventive_procedure therapeutic or preventive procedure
sign_or_symptom sign or symptom
occupational_activity occupational activity
anatomical_abnormality anatomical abnormality
hormone hormone
fully_formed_anatomical_structure fully formed anatomical structure
educational_activity educational activity
quantitative_concept quantitative concept
tissue tissue
organism_function organism function
social_behavior social behavior
mental_or_behavioral_dysfunction mental or behavioral dysfunction
governmental_or_regulatory_activity governmental or regulatory activity
molecular_biology_research_technique molecular biology research technique
occupation_or_discipline occupation or discipline
conceptual_entity conceptual entity
body_location_or_region body location or region
pharmacologic_substance pharmacologic substance
clinical_drug clinical drug
food food
substance substance
genetic_function genetic function
congenital_abnormality congenital abnormality
medical_device medical device
carbohydrate carbohydrate
health_care_activity health care activity
eicosanoid eicosanoid
element_ion_or_isotope element ion or isotope
diagnostic_procedure diagnostic procedure
entity entity
event event
laboratory_procedure laboratory procedure
environmental_effect_of_humans environmental effect of humans
body_part_organ_or_organ_component body part organ or organ component
molecular_sequence molecular sequence
mental_process mental process
research_device research device
alga alga
natural_phenomenon_or_process natural phenomenon or process
anatomical_structure anatomical structure
animal animal
body_system body system
behavior behavior
carbohydrate_sequence carbohydrate sequence
archaeon archaeon
research_activity research activity
organization organization
individual_behavior individual behavior
organic_chemical organic chemical
finding finding
age_group age group
activity activity
machine_activity machine activity
plant plant
body_substance body substance
amino_acid_sequence amino acid sequence
neuroreactive_substance_or_biogenic_amine neuroreactive substance or biogenic amine

@ -1,155 +0,0 @@
"cells": [
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"path1 = './entities.txt'\n",
"path2 = './relations.txt'\n",
"path3 = './train.tsv'\n",
"path4 = './dev.tsv'\n",
"path5 = './test.tsv'\n",
"path6 = './get_neighbor/entity2id.txt'\n",
"path7 = './get_neighbor/relation2id.txt'\n",
"path8 = './get_neighbor/train2id.txt'\n",
"path9 = './get_neighbor/valid2id.txt'\n",
"path10 = './get_neighbor/test2id.txt'"
"cell_type": "code",
"execution_count": 7,
"metadata": {},
