Question generator

Jaabir
Published in featurepreneur
Jan 6, 2022

Question generation using NLP and transformers.

1. Install the necessary modules
pip install sentence-transformers transformers spacy scikit-learn
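
Step 3 below also calls spacy.load("en_core_web_sm"), so the small English spaCy model needs to be available as well; if it is not installed yet, it can be downloaded with spaCy's standard command:

python -m spacy download en_core_web_sm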

2. Import the necessary modules

from sentence_transformers import SentenceTransformer
from transformers import T5ForConditionalGeneration, T5Tokenizer, BertTokenizer, BertModel, AutoTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
import spacy
from warnings import filterwarnings as filt

filt('ignore')

3. Initialize the pre-trained models

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained("bert-base-uncased")
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
nlp = spacy.load("en_core_web_sm")

4. Create a function for generating a question

def get_question(sentence, answer):
    # T5 model fine-tuned on SQuAD for question generation
    mdl = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_squad_v1')
    tknizer = AutoTokenizer.from_pretrained('ramsrigouthamg/t5_squad_v1')

    # the model expects the context and the answer in this format
    text = "context: {} answer: {}".format(sentence, answer)
    max_len = 256
    encoding = tknizer.encode_plus(text, max_length=max_len, pad_to_max_length=False, truncation=True, return_tensors="pt")

    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

    # beam search decoding
    outs = mdl.generate(input_ids=input_ids,
                        attention_mask=attention_mask,
                        early_stopping=True,
                        num_beams=5,
                        num_return_sequences=1,
                        no_repeat_ngram_size=2,
                        max_length=300)

    dec = [tknizer.decode(ids, skip_special_tokens=True) for ids in outs]

    # strip the "question:" prefix from the generated text
    Question = dec[0].replace("question:", "")
    Question = Question.strip()
    return Question

This function requires two parameters:

  • sentence
  • answer

It generates a question for the sentence based on the answer, using the t5_squad_v1 pre-trained model, and returns the question as a string.

The answer can be any keyword taken from the sentence.

For example:

sentence:

tact labs is a company.

answer:

company

returns:

what is tact labs
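
As a quick sanity check, the same example can be run directly (note that get_question reloads the T5 checkpoint on every call, so the first call takes a moment, and the exact wording of the generated question may differ slightly from the output shown above):

question = get_question('tact labs is a company.', 'company')
print(question)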

5. Now create the functions that generate meaningful keywords, a.k.a. answers

def get_embedding(doc):
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained("bert-base-uncased")

    # wrap the text with BERT's special tokens
    txt = '[CLS] ' + doc + ' [SEP]'
    tokens = bert_tokenizer.tokenize(txt)
    token_idx = bert_tokenizer.convert_tokens_to_ids(tokens)
    segment_ids = [1] * len(tokens)

    torch_token = torch.tensor([token_idx])
    torch_segment = torch.tensor([segment_ids])

    # return the pooled sentence embedding as a numpy array
    return bert_model(torch_token, torch_segment)[-1].detach().numpy()

def get_pos(context):
    # part-of-speech tag for each token in the context
    doc = nlp(context)
    docs = [d.pos_ for d in doc]
    return docs, context.split()

def get_sent(context):
    # split the context into sentences
    doc = nlp(context)
    return list(doc.sents)

def get_vector(doc):
    # candidate keywords: unigrams with English stop words removed
    stop_words = "english"
    n_gram_range = (1, 1)
    df = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([doc])
    # note: get_feature_names() was renamed to get_feature_names_out() in newer scikit-learn releases
    return df.get_feature_names()


def get_key_words(context, module_type='t'):
    keywords = []
    top_n = 5
    for txt in get_sent(context):
        keywd = get_vector(str(txt))
        print(f'vectors : {keywd}')
        if module_type == 't':
            # BERT embeddings
            doc_embedding = get_embedding(str(txt))
            keywd_embedding = get_embedding(' '.join(keywd))
        else:
            # SentenceTransformer embeddings
            doc_embedding = model.encode([str(txt)])
            keywd_embedding = model.encode(keywd)

        # keep the top_n keywords most similar to the sentence
        distances = cosine_similarity(doc_embedding, keywd_embedding)
        print(distances)
        keywords += [(keywd[index], str(txt)) for index in distances.argsort()[0][-top_n:]]

    return keywords

It uses sklearn's CountVectorizer to extract the important keywords and returns a list of (answer, sentence) tuples, which we can then pass to the get_question function to generate the questions.

txt = 'Mauricio Pochettino open to leaving Paris St-Germain if Man Utd make approach'
get_key_words(txt)

------------------------------------------------------------------

[('approach',
  'Mauricio Pochettino open to leaving Paris St-Germain if Man Utd make approach')]
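
The second argument controls which embedding model is used: the default 't' goes through the BERT-based get_embedding helper, while any other value (such as 'st' in the next example) uses the SentenceTransformer model loaded in step 3:

get_key_words(txt)        # BERT embeddings (module_type='t', the default)
get_key_words(txt, 'st')  # SentenceTransformer embeddings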

Now let's use the above text to generate the questions:

txt = 'Mauricio Pochettino open to leaving Paris St-Germain if Man Utd make approach'
for ans, context in get_key_words(txt, 'st'):
    print('=======================================')
    print()
    print(get_question(context, ans))
    print()

=======================================
Who is open to leaving Paris St-Germain if Man Utd make an approach?

=======================================

Who is open to leaving Paris St-Germain if Man Utd make an approach?

=======================================

Is Mauricio Pochettino open to leaving Paris St-Germain?

=======================================

Pochettino is open to leaving Paris St-Germain if Man Utd make an approach?

=======================================

What city is Mauricio Pochettino open to leaving if Man Utd make an approach?
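
For convenience, the whole pipeline can be wrapped into a single helper. This is just a small sketch built from the functions defined above; generate_questions is a name introduced here for illustration:

def generate_questions(text, module_type='st'):
    # keyword extraction followed by question generation
    return [get_question(context, ans) for ans, context in get_key_words(text, module_type)]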
