In [1]:
# Use the trained astroBERT model to make NER predictions

# Tutorial 2 - Using astroBERT to make NER Predictions
This tutorial shows you how to use astroBERT to make NER predictions.  
For a list and description of the labels, see DEAL@WIESP 2022 (https://ui.adsabs.harvard.edu/WIESP/2022/LabelDefinitions)

In [2]:
from transformers import AutoModelForTokenClassification, AutoTokenizer

2022-10-31 11:27:34.285610: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [3]:
# load astroBERT for NER-DEAL
remote_model_path = 'adsabs/astroBERT'
# the astroBERT weights trained for NER-DEAL are on a separate branch,
# so that branch is requested explicitly when loading the model
revision = 'NER-DEAL'

# token-classification model, taken from the NER-DEAL branch
astroBERT_NER_DEAL = AutoModelForTokenClassification.from_pretrained(
    remote_model_path,
    revision=revision,
)

# tokenizer: no revision is passed, so it comes from the default branch;
# do_lower_case=False keeps the input text cased
astroBERT_tokenizer = AutoTokenizer.from_pretrained(
    remote_model_path,
    add_special_tokens=True,
    do_lower_case=False,
)

In [4]:
from transformers import TokenClassificationPipeline

In [5]:
# wrap the model and tokenizer in the Hugging Face token-classification pipeline
NER_pileline = TokenClassificationPipeline(
    model=astroBERT_NER_DEAL,
    tokenizer=astroBERT_tokenizer,
    task='astroBERT NER_DEAL',
    # group sub-word tokens into entities, averaging their scores
    aggregation_strategy='average',
    # leave out tokens labelled 'O' (presumably "outside any entity" -- see the label definitions)
    ignore_labels=['O'],
)

In [6]:
# make predictions on text of your choice: replace the sample paragraph below
# with any string you want to run NER on
text = 'The National Aeronautics and Space Administration (NASA /ˈnæsə/) is an independent agency of the US federal government responsible for the civil space program, aeronautics research, and space research.[note 1] NASA was established in 1958, succeeding the National Advisory Committee for Aeronautics (NACA), to give the U.S. space development effort a distinctly civilian orientation, emphasizing peaceful applications in space science.[7][8][9] Since its establishment, most American space exploration efforts have been led by NASA, including the Apollo Moon landing missions, the Skylab space station, and later the Space Shuttle. NASA supports the International Space Station and oversees the development of the Orion spacecraft, the Space Launch System, Commercial Crew vehicles, and the planned Lunar Gateway space station. The agency is also responsible for the Launch Services Program'
# run the pipeline on the paragraph; the result is a list with one dict per detected entity
pred = NER_pileline(text)    

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [7]:
# each prediction is a dict with the entity label ('entity_group'), its score,
# the matched text ('word') and the 'start'/'end' character offsets into `text`
pred

[{'entity_group': 'Organization',
  'score': 0.8618845,
  'word': 'National Aeronautics and Space Administration',
  'start': 4,
  'end': 49},
 {'entity_group': 'Organization',
  'score': 0.5089845,
  'word': '( NASA / ˈnæsə / )',
  'start': 50,
  'end': 64},
 {'entity_group': 'Location',
  'score': 0.84241116,
  'word': 'US',
  'start': 97,
  'end': 99},
 {'entity_group': 'Organization',
  'score': 0.35269254,
  'word': 'aeronautics',
  'start': 160,
  'end': 171},
 {'entity_group': 'Organization',
  'score': 0.5513096,
  'word': 'NASA',
  'start': 210,
  'end': 214},
 {'entity_group': 'Organization',
  'score': 0.81638354,
  'word': 'National Advisory Committee for Aeronautics',
  'start': 255,
  'end': 298},
 {'entity_group': 'Organization',
  'score': 0.60596126,
  'word': '( NACA )',
  'start': 299,
  'end': 305},
 {'entity_group': 'Location',
  'score': 0.9698747,
  'word': 'U',
  'start': 319,
  'end': 320},
 {'entity_group': 'Location',
  'score': 0.8322766,
  'word': 'S',
  's

In [8]:
# view the results using custom function
def format_pred_for_print(pred, paragraph):
    '''
    returns a pretty string with the predictions in paragraph highlighted.

    Every predicted span is rewritten as "[<span text> (<label> <score>)]" and
    wrapped in ANSI red escape codes; the text between spans is copied unchanged.

    pred: prediction output from a pipeline, i.e. an iterable of dicts holding
          'start' and 'end' (character offsets into paragraph), 'entity_group'
          and 'score'. The entries may be given in any order.
    paragraph: the original text the predictions were made on
    '''
    RED_START = '\x1b[31m'
    RED_END = '\x1b[0m'

    pieces = []
    cursor = 0  # offset in paragraph up to which text has already been emitted

    # Process spans left to right: slicing paragraph[cursor:start] only makes
    # sense when spans arrive in increasing start order, so sort defensively.
    # sorted() is stable, so input that is already ordered is left as-is.
    for entry in sorted(pred, key=lambda item: item['start']):
        start = entry['start']
        end = entry['end']
        # add what's in between the previous span and this one
        pieces.append(paragraph[cursor:start])
        # add the highlighted entry itself, annotated with label and score
        label = entry['entity_group']
        score = ' {:.2f}'.format(entry['score'])
        pieces.append(RED_START + '[' + paragraph[start:end] + ' (' + label + score + ')]' + RED_END)
        cursor = end

    # add whatever follows the last span
    pieces.append(paragraph[cursor:])
    return ''.join(pieces)


In [9]:
# print the paragraph with each predicted entity wrapped in ANSI red escape codes
print(format_pred_for_print(pred, text))

The [31m[National Aeronautics and Space Administration (Organization 0.86)][0m [31m[(NASA /ˈnæsə/) (Organization 0.51)][0m is an independent agency of the [31m[US (Location 0.84)][0m federal government responsible for the civil space program, [31m[aeronautics (Organization 0.35)][0m research, and space research.[note 1] [31m[NASA (Organization 0.55)][0m was established in 1958, succeeding the [31m[National Advisory Committee for Aeronautics (Organization 0.82)][0m [31m[(NACA) (Organization 0.61)][0m, to give the [31m[U (Location 0.97)][0m.[31m[S (Location 0.83)][0m. space development effort a distinctly civilian orientation, emphasizing peaceful applications in space science.[7][8][9] Since its establishment, most American space exploration efforts have been led by [31m[NASA (Organization 0.50)][0m, including the Apollo Moon landing missions, the [31m[Skylab (Mission 0.55)][0m space station, and later the Space [31m[Shuttle (Mission 0.91)][0m. NASA supports the 