Relphormer baseline

2022-12-26 04:54:46 +00:00
commit c0d0be076f
117 changed files with 1574716 additions and 0 deletions
@@ -0,0 +1,2 @@
+from .data_module import KGC
+from .processor import convert_examples_to_features, KGProcessor
@@ -0,0 +1,63 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import cython
+from cython.parallel cimport prange, parallel
+cimport numpy
+import numpy
+
+def floyd_warshall(adjacency_matrix):
+
+    (nrows, ncols) = adjacency_matrix.shape
+    assert nrows == ncols
+    cdef unsigned int n = nrows
+
+    adj_mat_copy = adjacency_matrix.astype(long, order='C', casting='safe', copy=True)
+    assert adj_mat_copy.flags['C_CONTIGUOUS']
+    cdef numpy.ndarray[long, ndim=2, mode='c'] M = adj_mat_copy
+    cdef numpy.ndarray[long, ndim=2, mode='c'] path = numpy.zeros([n, n], dtype=numpy.int64)
+
+    cdef unsigned int i, j, k
+    cdef long M_ij, M_ik, cost_ikkj
+    cdef long* M_ptr = &M[0,0]
+    cdef long* M_i_ptr
+    cdef long* M_k_ptr
+
+    # set unreachable nodes distance to 510
+    for i in range(n):
+        for j in range(n):
+            if i == j:
+                M[i][j] = 0
+            elif M[i][j] == 0:
+                M[i][j] = 510
+
+    # floyed algo
+    for k in range(n):
+        M_k_ptr = M_ptr + n*k
+        for i in range(n):
+            M_i_ptr = M_ptr + n*i
+            M_ik = M_i_ptr[k]
+            for j in range(n):
+                cost_ikkj = M_ik + M_k_ptr[j]
+                M_ij = M_i_ptr[j]
+                if M_ij > cost_ikkj:
+                    M_i_ptr[j] = cost_ikkj
+                    path[i][j] = k
+
+    # set unreachable path to 510
+    for i in range(n):
+        for j in range(n):
+            if M[i][j] >= 510:
+                path[i][j] = 510
+                M[i][j] = 510
+
+    return M, path
+
+
+def get_all_edges(path, i, j):
+    cdef unsigned int k = path[i][j]
+    if k == 0:
+        return []
+    else:
+        return get_all_edges(path, i, k) + [k] + get_all_edges(path, k, j)
+
@@ -0,0 +1,71 @@
+"""Base DataModule class."""
+from pathlib import Path
+from typing import Dict
+import argparse
+import os
+
+import pytorch_lightning as pl
+from torch.utils.data import DataLoader
+
+
+class Config(dict):
+    def __getattr__(self, name):
+        return self.get(name)
+
+    def __setattr__(self, name, val):
+        self[name] = val
+
+
+BATCH_SIZE = 8
+NUM_WORKERS = 8
+
+
+class BaseDataModule(pl.LightningDataModule):
+    """
+    Base DataModule.
+    Learn more at https://pytorch-lightning.readthedocs.io/en/stable/datamodules.html
+    """
+
+    def __init__(self, args: argparse.Namespace = None) -> None:
+        super().__init__()
+        self.args = Config(vars(args)) if args is not None else {}
+        self.batch_size = self.args.get("batch_size", BATCH_SIZE)
+        self.num_workers = self.args.get("num_workers", NUM_WORKERS)
+
+
+    @staticmethod
+    def add_to_argparse(parser):
+        parser.add_argument(
+            "--batch_size", type=int, default=BATCH_SIZE, help="Number of examples to operate on per forward step."
+        )
+        parser.add_argument(
+            "--num_workers", type=int, default=0, help="Number of additional processes to load data."
+        )
+        parser.add_argument(
+            "--dataset", type=str, default="./dataset/NELL", help="Number of additional processes to load data."
+        )
+        return parser
+
+    def prepare_data(self):
+        """
+        Use this method to do things that might write to disk or that need to be done only from a single GPU in distributed settings (so don't set state `self.x = y`).
+        """
+        pass
+
+    def setup(self, stage=None):
+        """
+        Split into train, val, test, and set dims.
+        Should assign `torch Dataset` objects to self.data_train, self.data_val, and optionally self.data_test.
+        """
+        self.data_train = None
+        self.data_val = None
+        self.data_test = None
+
+    def train_dataloader(self):
+        return DataLoader(self.data_train, shuffle=True, batch_size=self.batch_size, num_workers=self.num_workers, pin_memory=True)
+
+    def val_dataloader(self):
+        return DataLoader(self.data_val, shuffle=False, batch_size=self.batch_size, num_workers=self.num_workers, pin_memory=True)
+
+    def test_dataloader(self):
+        return DataLoader(self.data_test, shuffle=False, batch_size=self.batch_size, num_workers=self.num_workers, pin_memory=True)
@@ -0,0 +1,195 @@
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
+from enum import Enum
+import torch
+
+from torch.utils.data import DataLoader
+from transformers import AutoTokenizer, BertTokenizer
+# from transformers.configuration_bert import BertTokenizer, BertTokenizerFast
+from transformers.tokenization_utils_base import (BatchEncoding,
+                                                  PreTrainedTokenizerBase)
+
+from .base_data_module import BaseDataModule
+from .processor import KGProcessor, get_dataset
+import transformers
+transformers.logging.set_verbosity_error()
+
+class ExplicitEnum(Enum):
+    """
+    Enum with more explicit error message for missing values.
+    """
+
+    @classmethod
+    def _missing_(cls, value):
+        raise ValueError(
+            f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}"
+        )
+
+class PaddingStrategy(ExplicitEnum):
+    """
+    Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for tab-completion
+    in an IDE.
+    """
+
+    LONGEST = "longest"
+    MAX_LENGTH = "max_length"
+    DO_NOT_PAD = "do_not_pad"
+
+import numpy as np
+
+@dataclass
+class DataCollatorForSeq2Seq:
+    """
+    Data collator that will dynamically pad the inputs received, as well as the labels.
+
+    Args:
+        tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
+            The tokenizer used for encoding the data.
+        model (:class:`~transformers.PreTrainedModel`):
+            The model that is being trained. If set and has the `prepare_decoder_input_ids_from_labels`, use it to
+            prepare the `decoder_input_ids`
+
+            This is useful when using `label_smoothing` to avoid calculating loss twice.
+        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
+            among:
+
+            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+              sequence is provided).
+            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+              maximum acceptable input length for the model if that argument is not provided.
+            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+              different lengths).
+        max_length (:obj:`int`, `optional`):
+            Maximum length of the returned list and optionally padding length (see above).
+        pad_to_multiple_of (:obj:`int`, `optional`):
+            If set will pad the sequence to a multiple of the provided value.
+
+            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
+            7.5 (Volta).
+        label_pad_token_id (:obj:`int`, `optional`, defaults to -100):
+            The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
+    """
+
+    tokenizer: PreTrainedTokenizerBase
+    model: Optional[Any] = None
+    padding: Union[bool, str, PaddingStrategy] = True
+    max_length: Optional[int] = None
+    pad_to_multiple_of: Optional[int] = None
+    label_pad_token_id: int = -100
+    return_tensors: str = "pt"
+    num_labels: int = 0
+
+    def __call__(self, features, return_tensors=None):
+
+        if return_tensors is None:
+            return_tensors = self.return_tensors
+        labels = [feature.pop("labels") for feature in features] if "labels" in features[0].keys() else None
+        label = [feature.pop("label") for feature in features]
+        features_keys = {}
+        name_keys = list(features[0].keys())
+        for k in name_keys:
+            # ignore the padding arguments
+            if k in ["input_ids", "attention_mask", "token_type_ids"]: continue
+            try:
+                features_keys[k] = [feature.pop(k) for feature in features]
+            except KeyError:
+                continue
+
+        # We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the
+        # same length to return tensors.
+        bsz = len(labels)
+        with torch.no_grad():
+            new_labels = torch.zeros(bsz, self.num_labels)
+            for i,l in enumerate(labels):
+                if isinstance(l, int): 
+                    new_labels[i][l] = 1
+                else:
+                    for j in l:
+                        new_labels[i][j] = 1
+            labels = new_labels
+
+        features = self.tokenizer.pad(
+            features,
+            padding=self.padding,
+            max_length=self.max_length,
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            return_tensors=return_tensors,
+        )
+        features['labels'] = labels
+        features['label'] = torch.tensor(label)
+        features.update(features_keys)
+
+        return features
+
+
+
+class KGC(BaseDataModule):
+    def __init__(self, args, model) -> None:
+        super().__init__(args)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.args.model_name_or_path, use_fast=False)
+        self.processor = KGProcessor(self.tokenizer, args)
+        self.label_list = self.processor.get_labels(args.data_dir)
+
+        entity_list = self.processor.get_entities(args.data_dir)
+        
+        num_added_tokens = self.tokenizer.add_special_tokens({'additional_special_tokens': entity_list})
+        self.sampler = DataCollatorForSeq2Seq(self.tokenizer,
+            model=model,
+            label_pad_token_id=self.tokenizer.pad_token_id,
+            pad_to_multiple_of=8 if self.args.precision == 16 else None,
+            padding="longest",
+            max_length=self.args.max_seq_length,
+            num_labels = len(entity_list),
+        )
+        relations_tokens = self.processor.get_relations(args.data_dir)
+        self.num_relations = len(relations_tokens)
+        num_added_tokens = self.tokenizer.add_special_tokens({'additional_special_tokens': relations_tokens})
+
+        vocab = self.tokenizer.get_added_vocab()
+        self.relation_id_st = vocab[relations_tokens[0]]
+        self.relation_id_ed = vocab[relations_tokens[-1]] + 1
+        self.entity_id_st = vocab[entity_list[0]]
+        self.entity_id_ed = vocab[entity_list[-1]] + 1
+
+
+    def setup(self, stage=None):
+        self.data_train = get_dataset(self.args, self.processor, self.label_list, self.tokenizer, "train")
+        self.data_val = get_dataset(self.args, self.processor, self.label_list, self.tokenizer, "dev")
+        self.data_test = get_dataset(self.args, self.processor, self.label_list, self.tokenizer, "test")
+
+    def prepare_data(self):
+        pass
+
+    def get_config(self):
+        d = {}
+        for k, v in self.__dict__.items():
+            if "st" in k or "ed" in k:
+                d.update({k:v})
+        
+        return d
+
+
+    @staticmethod
+    def add_to_argparse(parser):
+        BaseDataModule.add_to_argparse(parser)
+        parser.add_argument("--model_name_or_path", type=str, default="roberta-base", help="the name or the path to the pretrained model")
+        parser.add_argument("--data_dir", type=str, default="roberta-base", help="the name or the path to the pretrained model")
+        parser.add_argument("--max_seq_length", type=int, default=256, help="Number of examples to operate on per forward step.")
+        parser.add_argument("--warm_up_radio", type=float, default=0.1, help="Number of examples to operate on per forward step.")
+        parser.add_argument("--eval_batch_size", type=int, default=8)
+        parser.add_argument("--overwrite_cache", action="store_true", default=False)
+        return parser
+
+    def get_tokenizer(self):
+        return self.tokenizer
+
+    def train_dataloader(self):
+        return DataLoader(self.data_train, num_workers=self.num_workers, pin_memory=True, collate_fn=self.sampler, batch_size=self.args.batch_size, shuffle=not self.args.faiss_init)
+
+    def val_dataloader(self):
+        return DataLoader(self.data_val, num_workers=self.num_workers, pin_memory=True, collate_fn=self.sampler, batch_size=self.args.eval_batch_size)
+
+    def test_dataloader(self):
+        return DataLoader(self.data_test, num_workers=self.num_workers, pin_memory=True, collate_fn=self.sampler, batch_size=self.args.eval_batch_size)
+
@@ -0,0 +1,954 @@
+from hashlib import new
+from re import DEBUG
+
+import contextlib
+import sys
+
+from collections import Counter
+from multiprocessing import Pool
+from torch._C import HOIST_CONV_PACKED_PARAMS
+from torch.utils.data import Dataset, Sampler, IterableDataset
+from collections import defaultdict
+from functools import partial
+from multiprocessing import Pool
+import os
+import random
+import json
+import torch
+import copy
+import numpy as np
+import pickle
+from tqdm import tqdm
+from dataclasses import dataclass, asdict, replace
+import inspect
+
+from transformers.models.auto.tokenization_auto import AutoTokenizer
+
+from models.utils import get_entity_spans_pre_processing
+import pyximport
+
+pyximport.install(setup_args={'include_dirs': np.get_include()})
+import data.algos as algos
+
+def lmap(a, b):
+    return list(map(a,b))  # a是个函数，b是个值列表，返回函数值列表
+
+def cache_results(_cache_fp, _refresh=False, _verbose=1):
+    r"""
+    cache_results是fastNLP中用于cache数据的装饰器。通过下面的例子看一下如何使用::
+
+        import time
+        import numpy as np
+        from fastNLP import cache_results
+        
+        @cache_results('cache.pkl')
+        def process_data():
+            # 一些比较耗时的工作，比如读取数据，预处理数据等，这里用time.sleep()代替耗时
+            time.sleep(1)
+            return np.random.randint(10, size=(5,))
+        
+        start_time = time.time()
+        print("res =",process_data())
+        print(time.time() - start_time)
+        
+        start_time = time.time()
+        print("res =",process_data())
+        print(time.time() - start_time)
+        
+        # 输出内容如下，可以看到两次结果相同，且第二次几乎没有花费时间
+        # Save cache to cache.pkl.
+        # res = [5 4 9 1 8]
+        # 1.0042750835418701
+        # Read cache from cache.pkl.
+        # res = [5 4 9 1 8]
+        # 0.0040721893310546875
+
+    可以看到第二次运行的时候，只用了0.0001s左右，是由于第二次运行将直接从cache.pkl这个文件读取数据，而不会经过再次预处理::
+
+        # 还是以上面的例子为例，如果需要重新生成另一个cache，比如另一个数据集的内容，通过如下的方式调用即可
+        process_data(_cache_fp='cache2.pkl')  # 完全不影响之前的‘cache.pkl'
+
+    上面的_cache_fp是cache_results会识别的参数，它将从'cache2.pkl'这里缓存/读取数据，即这里的'cache2.pkl'覆盖默认的
+    'cache.pkl'。如果在你的函数前面加上了@cache_results()则你的函数会增加三个参数[_cache_fp, _refresh, _verbose]。
+    上面的例子即为使用_cache_fp的情况，这三个参数不会传入到你的函数中，当然你写的函数参数名也不可能包含这三个名称::
+
+        process_data(_cache_fp='cache2.pkl', _refresh=True)  # 这里强制重新生成一份对预处理的cache。
+        #  _verbose是用于控制输出信息的，如果为0,则不输出任何内容;如果为1,则会提醒当前步骤是读取的cache还是生成了新的cache
+
+    :param str _cache_fp: 将返回结果缓存到什么位置;或从什么位置读取缓存。如果为None，cache_results没有任何效用，除非在
+        函数调用的时候传入_cache_fp这个参数。
+    :param bool _refresh: 是否重新生成cache。
+    :param int _verbose: 是否打印cache的信息。
+    :return:
+    """
+
+    def wrapper_(func):
+        signature = inspect.signature(func)
+        for key, _ in signature.parameters.items():
+            if key in ('_cache_fp', '_refresh', '_verbose'):
+                raise RuntimeError("The function decorated by cache_results cannot have keyword `{}`.".format(key))
+
+        def wrapper(*args, **kwargs):
+            my_args = args[0]
+            mode = args[-1]
+            if '_cache_fp' in kwargs:
+                cache_filepath = kwargs.pop('_cache_fp')
+                assert isinstance(cache_filepath, str), "_cache_fp can only be str."
+            else:
+                cache_filepath = _cache_fp
+            if '_refresh' in kwargs:
+                refresh = kwargs.pop('_refresh')
+                assert isinstance(refresh, bool), "_refresh can only be bool."
+            else:
+                refresh = _refresh
+            if '_verbose' in kwargs:
+                verbose = kwargs.pop('_verbose')
+                assert isinstance(verbose, int), "_verbose can only be integer."
+            else:
+                verbose = _verbose
+            refresh_flag = True
+            
+            model_name = my_args.model_name_or_path.split("/")[-1]
+            is_pretrain = my_args.pretrain
+            cache_filepath = os.path.join(my_args.data_dir, f"cached_{mode}_features{model_name}_pretrain{is_pretrain}_faiss{my_args.faiss_init}_seqlength{my_args.max_seq_length}_{my_args.litmodel_class}.pkl")
+            refresh = my_args.overwrite_cache
+
+            if cache_filepath is not None and refresh is False:
+                # load data
+                if os.path.exists(cache_filepath):
+                    with open(cache_filepath, 'rb') as f:
+                        results = pickle.load(f)
+                    if verbose == 1:
+                        logger.info("Read cache from {}.".format(cache_filepath))
+                    refresh_flag = False
+
+            if refresh_flag:
+                results = func(*args, **kwargs)
+                if cache_filepath is not None:
+                    if results is None:
+                        raise RuntimeError("The return value is None. Delete the decorator.")
+                    with open(cache_filepath, 'wb') as f:
+                        pickle.dump(results, f)
+                    logger.info("Save cache to {}.".format(cache_filepath))
+
+            return results
+
+        return wrapper
+
+    return wrapper_
+
+
+import argparse
+import csv
+import logging
+import os
+import random
+import sys
+
+import numpy as np
+import torch
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
+from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm, trange
+
+# from torch.nn import CrossEntropyLoss, MSELoss
+# from scipy.stats import pearsonr, spearmanr
+# from sklearn.metrics import matthews_corrcoef, f1_scoreclass 
+
+
+
+logger = logging.getLogger(__name__)
+
+class InputExample(object):
+    """A single training/test example for simple sequence classification."""
+
+    def __init__(self, guid, text_a, text_b=None, text_c=None, text_d=None, label=None, real_label=None, en=None, en_id=None, rel=None, text_d_id=None, graph_inf=None):
+        """Constructs a InputExample.
+
+        Args:
+            guid: Unique id for the example.
+            text_a: string. The untokenized text of the first sequence. For single
+            sequence tasks, only this sequence must be specified.
+            text_b: (Optional) string. The untokenized text of the second sequence.
+            Only must be specified for sequence pair tasks.
+            text_c: (Optional) string. The untokenized text of the third sequence.
+            Only must be specified for sequence triple tasks.
+            label: (Optional) string. list of entities
+        """
+        self.guid = guid
+        self.text_a = text_a
+        self.text_b = text_b
+        self.text_c = text_c
+        self.text_d = text_d
+        self.label = label
+        self.real_label = real_label
+        self.en = en
+        self.rel = rel # rel id
+        self.text_d_id = text_d_id
+        self.graph_inf = graph_inf
+        self.en_id = en_id
+
+
+@dataclass
+class InputFeatures:
+    """A single set of features of data."""
+
+    input_ids: torch.Tensor
+    attention_mask: torch.Tensor
+    labels: torch.Tensor = None
+    label: torch.Tensor = None
+    en: torch.Tensor = 0
+    rel: torch.Tensor = 0
+    pos: torch.Tensor = 0
+    graph: torch.Tensor = 0
+    distance_attention: torch.Tensor = 0
+    # attention_bias: torch.Tensor = 0
+
+
+class DataProcessor(object):
+    """Base class for data converters for sequence classification data sets."""
+
+    def get_train_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the train set."""
+        raise NotImplementedError()
+
+    def get_dev_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the dev set."""
+        raise NotImplementedError()
+
+    def get_labels(self, data_dir):
+        """Gets the list of labels for this data set."""
+        raise NotImplementedError()
+
+    @classmethod
+    def _read_tsv(cls, input_file, quotechar=None):
+        """Reads a tab separated value file."""
+        with open(input_file, "r", encoding="utf-8") as f:
+            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+            lines = []
+            for line in reader:
+                if sys.version_info[0] == 2:
+                    line = list(unicode(cell, 'utf-8') for cell in line)
+                lines.append(line)
+            return lines
+
+import copy
+
+
+def solve_get_knowledge_store(line, set_type="train", pretrain=1):
+    """
+    use the LM to get the entity embedding.
+    Transductive: triples + text description
+    Inductive: text description
+    
+    """
+    examples = []
+        
+    head_ent_text = ent2text[line[0]]
+    tail_ent_text = ent2text[line[2]]
+    relation_text = rel2text[line[1]]
+    
+    i=0
+    
+    a = tail_filter_entities["\t".join([line[0],line[1]])]
+    b = head_filter_entities["\t".join([line[2],line[1]])]
+    
+    guid = "%s-%s" % (set_type, i)
+    text_a = head_ent_text
+    text_b = relation_text
+    text_c = tail_ent_text 
+
+    # use the description of c to predict A
+    examples.append(
+        InputExample(guid=guid, text_a="[PAD]", text_b=text_b + "[PAD]", text_c = "[PAD]" + " " + text_c, label=lmap(lambda x: ent2id[x], b), real_label=ent2id[line[0]], en=[ent2id[line[0]], rel2id[line[1]], ent2id[line[2]]], rel=0)
+    )
+    examples.append(
+        InputExample(guid=guid, text_a="[PAD]", text_b=text_b + "[PAD]", text_c = "[PAD]" + " " + text_a, label=lmap(lambda x: ent2id[x], b), real_label=ent2id[line[2]], en=[ent2id[line[0]], rel2id[line[1]], ent2id[line[2]]], rel=0)
+    )
+    return examples
+
+
+def solve(line,  set_type="train", pretrain=1, max_triplet=32):
+    examples = []
+        
+    head_ent_text = ent2text[line[0]]
+    tail_ent_text = ent2text[line[2]]
+    relation_text = rel2text[line[1]]
+    
+    i=0
+    
+    a = tail_filter_entities["\t".join([line[0],line[1]])]
+    b = head_filter_entities["\t".join([line[2],line[1]])]
+    
+    guid = "%s-%s" % (set_type, i)
+    text_a = head_ent_text
+    text_b = relation_text
+    text_c = tail_ent_text
+
+    
+    if pretrain:
+        text_a_tokens = text_a.split()
+        for i in range(10):
+            st = random.randint(0, len(text_a_tokens))
+            examples.append(
+                InputExample(guid=guid, text_a="[MASK]", text_b=" ".join(text_a_tokens[st:min(st+64, len(text_a_tokens))]), text_c = "", label=ent2id[line[0]], real_label=ent2id[line[0]], en=0, rel=0)
+            )
+        examples.append(
+            InputExample(guid=guid, text_a="[MASK]", text_b=text_a, text_c = "", label=ent2id[line[0]], real_label=ent2id[line[0]], en=0, rel=0)
+        )
+        # examples.append(
+        #     InputExample(guid=guid, text_a="[MASK]", text_b=text_c, text_c = "", label=ent2id[line[2]], real_label=ent2id[line[2]], en=0, rel=0)
+        # )
+    else:
+        # 主要是对text_c进行包装，不再是原来的文本，而是对应子图的graph（变量graph_seq）。如果mask的是尾实体，那么就让text_c在后面加入graph_seq
+        # masked_head_seq = []
+        # masked_tail_seq = []
+        # masked_tail_graph_list = masked_tail_neighbor["\t".join([line[0],line[1]])]
+        # masked_head_graph_list = masked_head_neighbor["\t".join([line[2],line[1]])]
+        # for item in masked_head_graph_list:
+        #     masked_head_seq.append(ent2id[item[0]])
+        #     masked_head_seq.append(rel2id[item[1]])
+        #     masked_head_seq.append(ent2id[item[2]])
+
+        # for item in masked_tail_graph_list:
+        #     masked_tail_seq.append(ent2id[item[0]])
+        #     masked_tail_seq.append(rel2id[item[1]])
+        #     masked_tail_seq.append(ent2id[item[2]])
+
+        masked_head_seq = set()
+        masked_head_seq_id = set()
+        masked_tail_seq = set()
+        masked_tail_seq_id = set()
+
+        masked_tail_graph_list = masked_tail_neighbor["\t".join([line[0],line[1]])] if len(masked_tail_neighbor["\t".join([line[0],line[1]])]) < max_triplet else \
+            random.sample(masked_tail_neighbor["\t".join([line[0],line[1]])], max_triplet)
+        masked_head_graph_list = masked_head_neighbor["\t".join([line[2],line[1]])] if len(masked_head_neighbor["\t".join([line[2],line[1]])]) < max_triplet else \
+            random.sample(masked_head_neighbor["\t".join([line[2],line[1]])], max_triplet)
+        # masked_tail_graph_list = masked_tail_neighbor["\t".join([line[0],line[1]])][:16]
+        # masked_head_graph_list = masked_head_neighbor["\t".join([line[2],line[1]])][:16]
+        for item in masked_head_graph_list:
+            masked_head_seq.add(item[0])
+            masked_head_seq.add(item[1])
+            masked_head_seq.add(item[2])
+            masked_head_seq_id.add(ent2id[item[0]])
+            masked_head_seq_id.add(rel2id[item[1]])
+            masked_head_seq_id.add(ent2id[item[2]])
+
+        for item in masked_tail_graph_list:
+            masked_tail_seq.add(item[0])
+            masked_tail_seq.add(item[1])
+            masked_tail_seq.add(item[2])
+            masked_tail_seq_id.add(ent2id[item[0]])
+            masked_tail_seq_id.add(rel2id[item[1]])
+            masked_tail_seq_id.add(ent2id[item[2]])
+        # print(masked_tail_seq)
+        masked_head_seq = masked_head_seq.difference({line[0]})
+        masked_head_seq = masked_head_seq.difference({line[2]})
+        masked_head_seq = masked_head_seq.difference({line[1]})
+        masked_head_seq_id = masked_head_seq_id.difference({ent2id[line[0]]})
+        masked_head_seq_id = masked_head_seq_id.difference({rel2id[line[1]]})
+        masked_head_seq_id = masked_head_seq_id.difference({ent2id[line[2]]})
+
+        masked_tail_seq = masked_tail_seq.difference({line[0]})
+        masked_tail_seq = masked_tail_seq.difference({line[2]})
+        masked_tail_seq = masked_tail_seq.difference({line[1]})
+        masked_tail_seq_id = masked_tail_seq_id.difference({ent2id[line[0]]})
+        masked_tail_seq_id = masked_tail_seq_id.difference({rel2id[line[1]]})
+        masked_tail_seq_id = masked_tail_seq_id.difference({ent2id[line[2]]})
+        # examples.append(
+        #     InputExample(guid=guid, text_a="[MASK]", text_b=' '.join(text_b.split(' ')[:16]) + " [PAD]", text_c = "[PAD]" + " " + ' '.join(text_c.split(' ')[:16]), text_d = masked_head_seq, label=lmap(lambda x: ent2id[x], b), real_label=ent2id[line[0]], en=[rel2id[line[1]], ent2id[line[2]]], rel=rel2id[line[1]]))
+        # examples.append(
+        #     InputExample(guid=guid, text_a="[PAD] ", text_b=' '.join(text_b.split(' ')[:16]) + " [PAD]", text_c = "[MASK]" +" " + ' '.join(text_a.split(' ')[:16]), text_d = masked_tail_seq, label=lmap(lambda x: ent2id[x], a), real_label=ent2id[line[2]], en=[ent2id[line[0]], rel2id[line[1]]], rel=rel2id[line[1]]))
+        examples.append(
+            InputExample(guid=guid, text_a="[MASK]", text_b="[PAD]", text_c = "[PAD]", text_d = list(masked_head_seq), label=lmap(lambda x: ent2id[x], b), real_label=ent2id[line[0]], en=[line[1], line[2]], en_id = [rel2id[line[1]], ent2id[line[2]]], rel=rel2id[line[1]], text_d_id = list(masked_head_seq_id), graph_inf = masked_head_graph_list))
+        examples.append(
+            InputExample(guid=guid, text_a="[PAD]", text_b="[PAD]", text_c = "[MASK]", text_d = list(masked_tail_seq), label=lmap(lambda x: ent2id[x], a), real_label=ent2id[line[2]], en=[line[0], line[1]], en_id = [ent2id[line[0]], rel2id[line[1]]], rel=rel2id[line[1]], text_d_id = list(masked_tail_seq_id), graph_inf = masked_tail_graph_list))
+    return examples
+
+def filter_init(head, tail, t1,t2, ent2id_, ent2token_, rel2id_, masked_head_neighbor_, masked_tail_neighbor_, rel2token_):
+    global head_filter_entities
+    global tail_filter_entities
+    global ent2text
+    global rel2text
+    global ent2id
+    global ent2token
+    global rel2id
+    global masked_head_neighbor
+    global masked_tail_neighbor
+    global rel2token
+
+    head_filter_entities = head
+    tail_filter_entities = tail
+    ent2text =t1
+    rel2text =t2
+    ent2id = ent2id_
+    ent2token = ent2token_
+    rel2id = rel2id_
+    masked_head_neighbor = masked_head_neighbor_
+    masked_tail_neighbor = masked_tail_neighbor_
+    rel2token = rel2token_
+
+def delete_init(ent2text_):
+    global ent2text
+    ent2text = ent2text_
+
+
+class KGProcessor(DataProcessor):
+    """Processor for knowledge graph data set."""
+    def __init__(self, tokenizer, args):
+        self.labels = set()
+        self.tokenizer = tokenizer
+        self.args = args
+        self.entity_path = os.path.join(args.data_dir, "entity2textlong.txt") if os.path.exists(os.path.join(args.data_dir, 'entity2textlong.txt')) \
+        else os.path.join(args.data_dir, "entity2text.txt")
+    
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train", data_dir, self.args)
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev", data_dir, self.args)
+
+    def get_test_examples(self, data_dir, chunk=""):
+      """See base class."""
+      return self._create_examples(
+          self._read_tsv(os.path.join(data_dir, f"test{chunk}.tsv")), "test", data_dir, self.args)
+
+    def get_relations(self, data_dir):
+        """Gets all labels (relations) in the knowledge graph."""
+        # return list(self.labels)
+        with open(os.path.join(data_dir, "relations.txt"), 'r') as f:
+            lines = f.readlines()
+            relations = []
+            for line in lines:
+                relations.append(line.strip().split('\t')[0])
+        rel2token = {ent : f"[RELATION_{i}]" for i, ent in enumerate(relations)}
+        return list(rel2token.values())
+
+    def get_labels(self, data_dir):
+        """Gets all labels (0, 1) for triples in the knowledge graph."""
+        relation = []
+        with open(os.path.join(data_dir, "relation2text.txt"), 'r') as f:
+            lines = f.readlines()
+            entities = []
+            for line in lines:
+                relation.append(line.strip().split("\t")[-1])
+        return relation
+
+    def get_entities(self, data_dir):
+        """Gets all entities in the knowledge graph."""
+        with open(self.entity_path, 'r') as f:
+            lines = f.readlines()
+            entities = []
+            for line in lines:
+                entities.append(line.strip().split("\t")[0])
+        
+        ent2token = {ent : f"[ENTITY_{i}]" for i, ent in enumerate(entities)}
+        return list(ent2token.values())
+
+    def get_train_triples(self, data_dir):
+        """Gets training triples."""
+        return self._read_tsv(os.path.join(data_dir, "train.tsv"))
+
+    def get_dev_triples(self, data_dir):
+        """Gets validation triples."""
+        return self._read_tsv(os.path.join(data_dir, "dev.tsv"))
+
+    def get_test_triples(self, data_dir, chunk=""):
+        """Gets test triples."""
+        return self._read_tsv(os.path.join(data_dir, f"test{chunk}.tsv"))
+
+    def _create_examples(self, lines, set_type, data_dir, args):
+        """Creates examples for the training and dev sets."""
+        # entity to text
+        ent2text = {}
+        ent2text_with_type = {}
+        with open(self.entity_path, 'r') as f:
+            ent_lines = f.readlines()
+            for line in ent_lines:
+                temp = line.strip().split('\t')
+                try:
+                    end = temp[1]#.find(',')
+                    if "wiki" in data_dir:
+                        assert "Q" in temp[0]
+                    ent2text[temp[0]] = temp[1].replace("\\n", " ").replace("\\", "") #[:end]
+                except IndexError:
+                    # continue
+                    end = " "#.find(',')
+                    if "wiki" in data_dir:
+                        assert "Q" in temp[0]
+                    ent2text[temp[0]] = end #[:end]
+  
+        entities = list(ent2text.keys())
+        ent2token = {ent : f"[ENTITY_{i}]" for i, ent in enumerate(entities)}
+        ent2id = {ent : i for i, ent in enumerate(entities)}
+        
+        rel2text = {}
+        with open(os.path.join(data_dir, "relation2text.txt"), 'r') as f:
+            rel_lines = f.readlines()
+            for line in rel_lines:
+                temp = line.strip().split('\t')
+                rel2text[temp[0]] = temp[1]      
+
+        relation_names = {}
+        with open(os.path.join(data_dir, "relations.txt"), "r") as file:
+            for line in file.readlines():
+                t = line.strip()
+                relation_names[t] = rel2text[t]
+
+        tmp_lines = []
+        not_in_text = 0
+        for line in tqdm(lines, desc="delete entities without text name."):
+            if (line[0] not in ent2text) or (line[2] not in ent2text) or (line[1] not in rel2text):
+                not_in_text += 1
+                continue
+            tmp_lines.append(line)
+        lines = tmp_lines
+        print(f"total entity not in text : {not_in_text} ")
+
+        relations = list(rel2text.keys())
+        rel2token = {rel : f"[RELATION_{i}]" for i, rel in enumerate(relations)}
+        # rel id -> relation token id
+        num_entities = len(self.get_entities(args.data_dir))
+        rel2id = {w:i+num_entities for i,w in enumerate(relation_names.keys())}
+
+
+        with open(os.path.join(data_dir, "masked_head_neighbor.txt"), 'r') as file:
+            masked_head_neighbor = json.load(file)
+
+        with open(os.path.join(data_dir, "masked_tail_neighbor.txt"), 'r') as file:
+            masked_tail_neighbor = json.load(file)
+        
+        examples = []
+        # head filter head entity
+        head_filter_entities = defaultdict(list)
+        tail_filter_entities = defaultdict(list)
+
+        dataset_list = ["train.tsv", "dev.tsv", "test.tsv"]
+        # in training, only use the train triples
+        if set_type == "train" and not args.pretrain: dataset_list = dataset_list[0:1]
+        for m in dataset_list:
+            with open(os.path.join(data_dir, m), 'r') as file:
+                train_lines = file.readlines()
+                for idx in range(len(train_lines)):
+                    train_lines[idx] = train_lines[idx].strip().split("\t")
+
+            for line in train_lines:
+                tail_filter_entities["\t".join([line[0], line[1]])].append(line[2])
+                head_filter_entities["\t".join([line[2], line[1]])].append(line[0])
+        
+        max_head_entities = max(len(_) for _ in head_filter_entities.values())
+        max_tail_entities = max(len(_) for _ in tail_filter_entities.values())
+
+        # use bce loss, ignore the mlm
+        if set_type == "train" and args.bce:
+            lines = []
+            for k, v in tail_filter_entities.items():
+                h, r = k.split('\t')
+                t = v[0]
+                lines.append([h, r, t])
+            for k, v in head_filter_entities.items():
+                t, r = k.split('\t')
+                h = v[0]
+                lines.append([h, r, t])
+        
+
+        # for training , select each entity as for get mask embedding.
+        if args.pretrain:
+            rel = list(rel2text.keys())[0]
+            lines = []
+            for k in ent2text.keys():
+                lines.append([k, rel, k])
+        
+        print(f"max number of filter entities : {max_head_entities} {max_tail_entities}")
+        # 把子图信息加入到filter_init中（初始化为文件夹，及固定子图），设置为全局变量，solve中调用
+        from os import cpu_count
+        threads = min(1, cpu_count())
+        filter_init(head_filter_entities, tail_filter_entities,ent2text, rel2text, ent2id, ent2token, rel2id, masked_head_neighbor, masked_tail_neighbor, rel2token
+            )
+        
+        if hasattr(args, "faiss_init") and args.faiss_init:
+            annotate_ = partial(
+                solve_get_knowledge_store,
+                pretrain=self.args.pretrain
+            )
+        else:
+            annotate_ = partial(
+                solve,
+                pretrain=self.args.pretrain,
+                max_triplet=self.args.max_triplet
+            )
+        examples = list(
+            tqdm(
+                map(annotate_, lines),
+                total=len(lines),
+                desc="convert text to examples"
+            )
+        )
+
+        tmp_examples = []
+        for e in examples:
+            for ee in e:
+                tmp_examples.append(ee)
+        examples = tmp_examples
+        # delete vars
+        del head_filter_entities, tail_filter_entities, ent2text, rel2text, ent2id, ent2token, rel2id
+        return examples
+
+class Verbalizer(object):
+    def __init__(self, args):
+        if "WN18RR" in args.data_dir:
+            self.mode = "WN18RR"
+        elif "FB15k" in args.data_dir:
+            self.mode = "FB15k"
+        elif "umls" in args.data_dir:
+            self.mode = "umls"
+        elif "codexs" in args.data_dir:
+            self.mode = "codexs"
+        elif "FB13" in args.data_dir:
+            self.mode = "FB13"
+        elif "WN11" in args.data_dir:
+            self.mode = "WN11"
+        
+    
+    def _convert(self, head, relation, tail):
+        if self.mode == "umls":
+            return f"The {relation} {head} is "
+        
+        return f"{head} {relation}"
+
+
+class KGCDataset(Dataset):
+    def __init__(self, features):
+        self.features = features
+
+    def __getitem__(self, index):
+        return self.features[index]
+    
+    def __len__(self):
+        return len(self.features)
+
+def convert_examples_to_features_init(tokenizer_for_convert):
+    global tokenizer
+    tokenizer = tokenizer_for_convert
+
+def convert_examples_to_features(example, max_seq_length, mode, pretrain=1):
+    """Loads a data file into a list of `InputBatch`s."""
+    text_a = " ".join(example.text_a.split()[:128])
+    text_b = " ".join(example.text_b.split()[:128])
+    text_c = " ".join(example.text_c.split()[:128])
+    
+    if pretrain:
+        input_text_a = text_a
+        input_text_b = text_b
+    else:
+        input_text_a = " ".join([text_a, text_b])
+        input_text_b = text_c
+    
+
+    inputs = tokenizer(
+        input_text_a,
+        input_text_b,
+        truncation="longest_first",
+        max_length=max_seq_length,
+        padding="longest",
+        add_special_tokens=True,
+    )
+    # assert tokenizer.mask_token_id in inputs.input_ids, "mask token must in input"
+
+    features = asdict(InputFeatures(input_ids=inputs["input_ids"],
+                            attention_mask=inputs['attention_mask'],
+                            labels=torch.tensor(example.label),
+                            label=torch.tensor(example.real_label)
+        )
+    )
+    return features
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+    """Truncates a sequence pair in place to the maximum length."""
+
+    # This is a simple heuristic which will always truncate the longer sequence
+    # one token at a time. This makes more sense than truncating an equal percent
+    # of tokens from each, since if one sequence is very short then each token
+    # that's truncated likely contains more information than a longer sequence.
+    while True:
+        total_length = len(tokens_a) + len(tokens_b)
+        if total_length <= max_length:
+            break
+        if len(tokens_a) > len(tokens_b):
+            tokens_a.pop()
+        else:
+            tokens_b.pop()
+
+def _truncate_seq_triple(tokens_a, tokens_b, tokens_c, max_length):
+    """Truncates a sequence triple in place to the maximum length."""
+
+    # This is a simple heuristic which will always truncate the longer sequence
+    # one token at a time. This makes more sense than truncating an equal percent
+    # of tokens from each, since if one sequence is very short then each token
+    # that's truncated likely contains more information than a longer sequence.
+    while True:
+        total_length = len(tokens_a) + len(tokens_b) + len(tokens_c)
+        if total_length <= max_length:
+            break
+        if len(tokens_a) > len(tokens_b) and len(tokens_a) > len(tokens_c):
+            tokens_a.pop()
+        elif len(tokens_b) > len(tokens_a) and len(tokens_b) > len(tokens_c):
+            tokens_b.pop()
+        elif len(tokens_c) > len(tokens_a) and len(tokens_c) > len(tokens_b):
+            tokens_c.pop()
+        else:
+            tokens_c.pop()
+
+
+@cache_results(_cache_fp="./dataset")
+def get_dataset(args, processor, label_list, tokenizer, mode):
+
+    assert mode in ["train", "dev", "test"], "mode must be in train dev test!"
+
+    # use training data to construct the entity embedding
+    combine_train_and_test = False
+    if args.faiss_init and mode == "test" and not args.pretrain:
+        mode = "train"
+        if "ind" in args.data_dir: combine_train_and_test = True
+    else:
+        pass
+
+    if mode == "train":
+        train_examples = processor.get_train_examples(args.data_dir)
+    elif mode == "dev":
+        train_examples = processor.get_dev_examples(args.data_dir)
+    else:
+        train_examples = processor.get_test_examples(args.data_dir)
+    
+    if combine_train_and_test:
+        logger.info("use all the dataset for getting the entity mask embedding in pretraining pretraining")
+        logger.info("use all the dataset for getting the entity mask embedding in pretraining pretraining")
+        train_examples = processor.get_test_examples(args.data_dir) + processor.get_train_examples(args.data_dir) + processor.get_dev_examples(args.data_dir)
+
+    from os import cpu_count
+    with open(os.path.join(args.data_dir, f"examples_{mode}.txt"), 'w') as file:
+        for line in train_examples:
+            d = {}
+            d.update(line.__dict__)
+            file.write(json.dumps(d) + '\n')
+
+    # 这里应该不需要重新from_pretrain，必须沿用加入token的
+    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=False)
+
+    features = []
+
+    file_inputs = [os.path.join(args.data_dir, f"examples_{mode}.txt")]
+    file_outputs = [os.path.join(args.data_dir, f"features_{mode}.txt")]
+
+    with contextlib.ExitStack() as stack:
+        inputs = [
+            stack.enter_context(open(input, "r", encoding="utf-8"))
+            if input != "-" else sys.stdin
+            for input in file_inputs
+        ]
+        outputs = [
+            stack.enter_context(open(output, "w", encoding="utf-8"))
+            if output != "-" else sys.stdout
+            for output in file_outputs
+        ]
+
+        encoder = MultiprocessingEncoder(tokenizer, args)
+        pool = Pool(16, initializer=encoder.initializer)
+        encoder.initializer()
+        encoded_lines = pool.imap(encoder.encode_lines, zip(*inputs), 1000)
+        # encoded_lines = map(encoder.encode_lines, zip(*inputs))
+
+        stats = Counter()
+        for i, (filt, enc_lines) in tqdm(enumerate(encoded_lines, start=1), total=len(train_examples)):
+            if filt == "PASS":
+                for enc_line, output_h in zip(enc_lines, outputs):
+                    features.append(eval(enc_line))
+                    # features.append(enc_line)
+                    # print(enc_line, file=output_h)
+            else:
+                stats["num_filtered_" + filt] += 1
+
+        for k, v in stats.most_common():
+            print("[{}] filtered {} lines".format(k, v), file=sys.stderr)
+
+    for f_id, f in enumerate(features):
+        en = features[f_id].pop("en")
+        rel = features[f_id].pop("rel")
+        graph = features[f_id].pop("graph")
+        real_label = f['label']
+        features[f_id]['distance_attention'] = torch.Tensor(features[f_id]['distance_attention'])
+        
+        cnt = 0
+        cnt_2 = 0
+        if not isinstance(en, list): break
+
+        pos = 0
+        for i,t in enumerate(f['input_ids']):
+            if t == tokenizer.pad_token_id:
+                features[f_id]['input_ids'][i] = en[cnt] + len(tokenizer)
+                cnt += 1
+            if t == tokenizer.unk_token_id:
+                features[f_id]['input_ids'][i] = graph[cnt_2] + len(tokenizer)
+                cnt_2 += 1
+            if features[f_id]['input_ids'][i] == real_label + len(tokenizer):
+                pos = i
+            if cnt_2 == len(graph) and cnt == len(en): break
+            # 如果等于UNK， pop出图节点list，然后替换
+        assert not (args.faiss_init and pos == 0)
+        features[f_id]['pos'] = pos
+
+        # for i,t in enumerate(f['input_ids']):
+        #     if t == tokenizer.pad_token_id:
+        #         features[f_id]['input_ids'][i] = rel + len(tokenizer) + num_entities
+        #         break
+
+        
+
+    features = KGCDataset(features)
+    return features
+
+
+class MultiprocessingEncoder(object):
+    def __init__(self, tokenizer, args):
+        self.tokenizer = tokenizer
+        self.pretrain = args.pretrain
+        self.max_seq_length = args.max_seq_length
+
+    def initializer(self):
+        global bpe
+        bpe = self.tokenizer
+
+    def encode(self, line):
+        global bpe
+        ids = bpe.encode(line)
+        return list(map(str, ids))
+
+    def decode(self, tokens):
+        global bpe
+        return bpe.decode(tokens)
+
+    def encode_lines(self, lines):
+        """
+        Encode a set of lines. All lines will be encoded together.
+        """
+        enc_lines = []
+        for line in lines:
+            line = line.strip()
+            if len(line) == 0:
+                return ["EMPTY", None]
+            # enc_lines.append(" ".join(tokens))
+            enc_lines.append(json.dumps(self.convert_examples_to_features(example=eval(line))))
+            # enc_lines.append(" ")
+            # enc_lines.append("123")
+        return ["PASS", enc_lines]
+
+    def decode_lines(self, lines):
+        dec_lines = []
+        for line in lines:
+            tokens = map(int, line.strip().split())
+            dec_lines.append(self.decode(tokens))
+        return ["PASS", dec_lines]
+
+    def convert_examples_to_features(self, example):
+        pretrain = self.pretrain
+        max_seq_length = self.max_seq_length
+        global bpe
+        """Loads a data file into a list of `InputBatch`s."""
+        # tokens_a = tokenizer.tokenize(example.text_a)
+        # tokens_b = tokenizer.tokenize(example.text_b)
+        # tokens_c = tokenizer.tokenize(example.text_c)
+
+        # _truncate_seq_triple(tokens_a, tokens_b, tokens_c, max_length= max_seq_length)
+        # text_a = " ".join(example['text_a'].split()[:128])
+        # text_b = " ".join(example['text_b'].split()[:128])
+        # text_c = " ".join(example['text_c'].split()[:128])
+        
+        text_a = example['text_a']
+        text_b = example['text_b']
+        text_c = example['text_c']
+        text_d = example['text_d']
+        graph_list = example['graph_inf']
+
+        if pretrain:
+            # the des of xxx is [MASK] .
+            input_text = f"The description of {text_a} is that {text_b} ."
+            inputs = bpe(
+                input_text,
+                truncation="longest_first",
+                max_length=max_seq_length,
+                padding="longest",
+                add_special_tokens=True,
+            )
+        else:
+            if text_a == "[MASK]":
+                input_text_a = " ".join([text_a, text_b])
+                input_text_b = text_c
+                origin_triplet = ["MASK"] + example['en']
+                graph_seq = ["MASK"] + example['en'] + text_d
+            else:
+                input_text_a = text_a
+                input_text_b = " ".join([text_b, text_c])
+                origin_triplet = example['en'] + ["MASK"]
+                graph_seq = example['en'] + ["MASK"] + text_d
+            # 加入graph信息, 拼接等量[UNK]
+            input_text_b = " ".join(["[CLS]", input_text_a, input_text_b, bpe.unk_token * len(text_d)])
+
+            inputs = bpe(
+                input_text_b,
+                truncation="longest_first",
+                max_length=max_seq_length,
+                padding="longest",
+                add_special_tokens=False,
+            )
+        # assert bpe.mask_token_id in inputs.input_ids, "mask token must in input"
+
+        # graph_seq = input_text_b[] 把图结构信息读取出来
+        # [CLS] [ENTITY_13258] [RELATION_68] [MASK] [ENTITY_4] [RELATION_127] [ENTITY_8] [RELATION_9] [ENTITY_9011] [ENTITY_12477] [PAD] [PAD]
+        # 获取图结构信息
+        # 首先在solve中加入一个存储所有子图三元组的临时存储变量
+        # 在这里graph_information = example['graph']
+        new_rel = set()
+        new_rel.add(tuple((origin_triplet[0], origin_triplet[1])))
+        new_rel.add(tuple((origin_triplet[1], origin_triplet[0])))
+        new_rel.add(tuple((origin_triplet[1], origin_triplet[2])))
+        new_rel.add(tuple((origin_triplet[2], origin_triplet[1])))
+        for triplet in graph_list:
+            rel1, rel2, rel3, rel4 = tuple((triplet[0], triplet[1])), tuple((triplet[1], triplet[2])), tuple((triplet[1], triplet[0])), tuple((triplet[2], triplet[1]))
+            new_rel.add(rel1)
+            new_rel.add(rel2)
+            new_rel.add(rel3)
+            new_rel.add(rel4)
+        # 这里的三元组转换为new_rel
+        KGid2Graphid_map = defaultdict(int)
+        for i in range(len(graph_seq)):
+            KGid2Graphid_map[graph_seq[i]] = i
+
+        N = len(graph_seq)
+        adj = torch.zeros([N, N], dtype=torch.bool)
+        for item in list(new_rel):
+            adj[KGid2Graphid_map[item[0]], KGid2Graphid_map[item[1]]] = True
+        shortest_path_result, _ = algos.floyd_warshall(adj.numpy())
+        max_dist = np.amax(shortest_path_result)
+        # [PAD]部分， [CLS]部分补全， [SEP]额外引入也当作[PAD]处理
+        # 加上一个attention_bias, PAD部分设置为-inf，在送入model前，对其进行处理, 将其相加（让模型无法关注PAD）
+        
+        # 加入attention到huggingface的BertForMaskedLM（这个可能需要再去查查）
+        # attention_bias = torch.zero(N, N, dtype=torch.float)
+        # attention_bias[torch.tensor(shortest_path_result == )]
+        features = asdict(InputFeatures(input_ids=inputs["input_ids"],
+                                attention_mask=inputs['attention_mask'],
+                                labels=example['label'],
+                                label=example['real_label'],
+                                en=example['en_id'],
+                                rel=example['rel'],
+                                graph=example['text_d_id'],
+                                distance_attention = shortest_path_result.tolist(),
+            )
+        )
+        return features