Skip to content

Commit

Permalink
Add code up until tokenization
Browse files Browse the repository at this point in the history
  • Loading branch information
jteijema committed Nov 5, 2021
1 parent ccdfdf7 commit 292f093
Showing 1 changed file with 48 additions and 7 deletions.
55 changes: 48 additions & 7 deletions asreviewcontrib/semantic_clustering/semantic_clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,63 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# import
# import ASReview
from tqdm import tqdm
from asreview.data import ASReviewData

# import numpy
import numpy as np

class SemanticClustering():
def __init__(self, data: ASReviewData):
self.data = data
# import transformer autotokenizer and automodel
from transformers import AutoTokenizer, AutoModel

# create ASReview data object
# disable transformer warning
from transformers import logging
logging.set_verbosity_error()

#import tqdm

def load_data(ASReviewDataObject):

data = ASReviewDataObject.df[['title', 'abstract']].copy()
def SemanticClustering(asreview_data_object):

# load data
print("Loading data...")
data = load_data(asreview_data_object)

# cut data for testing
data = data.iloc[:10, :]

# load scibert transformer
print("Loading scibert transformer...")
transformer = 'allenai/scibert_scivocab_uncased'

# load transformer and tokenizer
print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(transformer)
model = AutoModel.from_pretrained(transformer)

# tokenize abstracts and add to data
print("Tokenizing abstracts...")
data['tokenized'] = data['abstract'].apply(lambda x: tokenizer.encode(
x,
padding='longest',
add_special_tokens=True,
return_tensors="pt"))

print(data)


def load_data(asreview_data_object):

# extract title and abstract, drop empty abstracts and reset index
data = asreview_data_object.df[['title', 'abstract']].copy()
data['abstract'] = data['abstract'].replace('', np.nan, inplace=False)
data.dropna(subset=['abstract'], inplace=True)
data = data.reset_index(drop=True)

return data


if __name__ == "__main__":
filepath = "https://raw.githubusercontent.com/asreview/systematic-review-datasets/master/datasets/van_de_Schoot_2017/output/van_de_Schoot_2017.csv"
SemanticClustering(ASReviewData.from_file(filepath))

0 comments on commit 292f093

Please sign in to comment.