Initial Commit
Browse files
README.md
CHANGED
@@ -1,3 +1,151 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Neurobiber: Fast and Interpretable Stylistic Feature Extraction
|
2 |
+
|
3 |
+
**Neurobiber** is a transformer-based model that quickly predicts **96 interpretable stylistic features** in text. These features are inspired by Biber’s multidimensional framework of linguistic style, capturing everything from **pronouns** and **passives** to **modal verbs** and **discourse devices**. By combining a robust, linguistically informed feature set with the speed of neural inference, Neurobiber enables large-scale stylistic analyses that were previously infeasible.
|
4 |
+
|
5 |
+
## Why Neurobiber?
|
6 |
+
|
7 |
+
Extracting Biber-style features typically involves running a full parser or specialized tagger, which can be computationally expensive for large datasets or real-time applications. Neurobiber overcomes these challenges by:
|
8 |
+
- **Operating up to 56x faster** than parsing-based approaches.
|
9 |
+
- Retaining the **interpretability** of classical Biber-like feature definitions.
|
10 |
+
- Delivering **high accuracy** on diverse text genres (e.g., social media, news, literary works).
|
11 |
+
- Allowing seamless integration with **modern deep learning** pipelines via Hugging Face.
|
12 |
+
|
13 |
+
By bridging detailed linguistic insights and industrial-scale performance, Neurobiber supports tasks in register analysis, style transfer, and more.
|
14 |
+
|
15 |
+
## Example Script
|
16 |
+
|
17 |
+
Below is an **example** showing how to load Neurobiber from Hugging Face, process single or multiple texts, and obtain a 96-dimensional binary vector for each input.
|
18 |
+
|
19 |
+
```python
|
20 |
+
import torch
|
21 |
+
import numpy as np
|
22 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
23 |
+
|
24 |
+
# Hugging Face model identifier passed to ``from_pretrained``.
MODEL_NAME = "blablablab/neurobiber"
# Neurobiber was trained with max_length=512.
CHUNK_SIZE = 512

# Names of the 96 features Neurobiber predicts. The order matters: position i
# of a prediction vector corresponds to BIBER_FEATURES[i].
BIBER_FEATURES = [
    "BIN_QUAN", "BIN_QUPR", "BIN_AMP", "BIN_PASS", "BIN_XX0", "BIN_JJ",
    "BIN_BEMA", "BIN_CAUS", "BIN_CONC", "BIN_COND", "BIN_CONJ", "BIN_CONT",
    "BIN_DPAR", "BIN_DWNT", "BIN_EX", "BIN_FPP1", "BIN_GER", "BIN_RB",
    "BIN_PIN", "BIN_INPR", "BIN_TO", "BIN_NEMD", "BIN_OSUB", "BIN_PASTP",
    "BIN_VBD", "BIN_PHC", "BIN_PIRE", "BIN_PLACE", "BIN_POMD", "BIN_PRMD",
    "BIN_WZPRES", "BIN_VPRT", "BIN_PRIV", "BIN_PIT", "BIN_PUBV", "BIN_SPP2",
    "BIN_SMP", "BIN_SERE", "BIN_STPR", "BIN_SUAV", "BIN_SYNE", "BIN_TPP3",
    "BIN_TIME", "BIN_NOMZ", "BIN_BYPA", "BIN_PRED", "BIN_TOBJ", "BIN_TSUB",
    "BIN_THVC", "BIN_NN", "BIN_DEMP", "BIN_DEMO", "BIN_WHQU", "BIN_EMPH",
    "BIN_HDG", "BIN_WZPAST", "BIN_THAC", "BIN_PEAS", "BIN_ANDC", "BIN_PRESP",
    "BIN_PROD", "BIN_SPAU", "BIN_SPIN", "BIN_THATD", "BIN_WHOBJ", "BIN_WHSUB",
    "BIN_WHCL", "BIN_ART", "BIN_AUXB", "BIN_CAP", "BIN_SCONJ", "BIN_CCONJ",
    "BIN_DET", "BIN_EMOJ", "BIN_EMOT", "BIN_EXCL", "BIN_HASH", "BIN_INF",
    "BIN_UH", "BIN_NUM", "BIN_LAUGH", "BIN_PRP", "BIN_PREP", "BIN_NNP",
    "BIN_QUES", "BIN_QUOT", "BIN_AT", "BIN_SBJP", "BIN_URL", "BIN_WH",
    "BIN_INDA", "BIN_ACCU", "BIN_PGAS", "BIN_CMADJ", "BIN_SPADJ", "BIN_X",
]
|
46 |
+
|
47 |
+
def load_model_and_tokenizer():
    """Load the Neurobiber classifier and its tokenizer.

    Both are fetched with ``from_pretrained(MODEL_NAME)``. The model is moved
    to the GPU when CUDA is available, otherwise it stays on the CPU, and it
    is put into evaluation mode before being returned.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Fall back to the CPU so the example also runs on machines without a GPU
    # instead of failing inside ``.to("cuda")``.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(device)
    model.eval()
    return model, tokenizer
|
52 |
+
|
53 |
+
def chunk_text(text, chunk_size=CHUNK_SIZE):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` words.

    Words are the whitespace-separated pieces of ``text``; each chunk is its
    words joined back together with single spaces.

    Returns:
        A list of chunk strings, or an empty list when ``text`` contains no
        words (empty or whitespace-only input).
    """
    words = text.strip().split()
    if not words:
        return []

    chunks = []
    for offset in range(0, len(words), chunk_size):
        window = words[offset:offset + chunk_size]
        chunks.append(" ".join(window))
    return chunks
|
58 |
+
|
59 |
+
def get_predictions_chunked_batch(model, tokenizer, texts, chunk_size=CHUNK_SIZE, subbatch_size=32):
    """Predict a binary feature vector for every text in ``texts``.

    Each text is split with ``chunk_text``; all chunks are scored by the model
    in sub-batches; the per-chunk sigmoid probabilities of one text are
    max-pooled per feature and thresholded at 0.5. A text that yields no
    chunks (empty or whitespace-only) gets an all-zero vector.

    Args:
        model: Sequence-classification model exposing ``config.num_labels``
            and returning ``.logits``.
        tokenizer: Tokenizer callable matching ``model``.
        texts: Sequence of input strings.
        chunk_size: Maximum number of whitespace-separated words per chunk;
            also passed to the tokenizer as ``max_length``.
        subbatch_size: Number of chunks sent through the model at once.

    Returns:
        An int32 NumPy array of shape ``(len(texts), model.config.num_labels)``
        containing 0/1 values, with rows in the same order as ``texts``.
    """
    from contextlib import nullcontext

    num_labels = model.config.num_labels
    # Follow the model's own device instead of assuming CUDA, so the function
    # works for CPU-loaded models too.
    device = next(model.parameters()).device
    use_amp = device.type == "cuda"

    # Flatten every text into one list of chunks, remembering which slice of
    # that list belongs to which input text.
    chunked_texts = []
    chunk_ranges = []
    for text in texts:
        start = len(chunked_texts)
        chunked_texts.extend(chunk_text(text, chunk_size))
        chunk_ranges.append((start, len(chunked_texts)))

    # Same dtype as the regular return path, so callers always get int32.
    predictions = np.zeros((len(texts), num_labels), dtype=np.int32)
    if not chunked_texts:
        return predictions

    # NOTE(review): chunk_size counts whitespace words when chunking but
    # subword tokens when truncating below; a chunk whose words expand to
    # more than chunk_size tokens loses its tail. Confirm this matches how
    # the model was trained.
    chunk_probs = []
    for offset in range(0, len(chunked_texts), subbatch_size):
        encodings = tokenizer(
            chunked_texts[offset:offset + subbatch_size],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=chunk_size,
        ).to(device)

        # Mixed precision only on CUDA; elsewhere run in the model's own dtype.
        autocast_ctx = torch.amp.autocast("cuda") if use_amp else nullcontext()
        with torch.no_grad(), autocast_ctx:
            outputs = model(**encodings)
            probs = torch.sigmoid(outputs.logits)
        # Upcasting to float32 is exact and keeps the CPU-side ops dtype-agnostic.
        chunk_probs.append(probs.float().cpu())

    chunk_probs = torch.cat(chunk_probs, dim=0)

    for row, (start, end) in enumerate(chunk_ranges):
        if start == end:
            # No tokens => no features; the row stays all zeros.
            continue
        # A feature counts as present if any chunk of the text predicts it.
        pooled = chunk_probs[start:end].max(dim=0).values
        predictions[row] = (pooled > 0.5).int().numpy()

    return predictions
|
106 |
+
|
107 |
+
def predict_batch(model, tokenizer, texts, chunk_size=CHUNK_SIZE, subbatch_size=32):
    """Return binary feature predictions for a list of texts.

    Delegates to ``get_predictions_chunked_batch`` with the same arguments
    and returns its result unchanged.
    """
    return get_predictions_chunked_batch(
        model,
        tokenizer,
        texts,
        chunk_size,
        subbatch_size,
    )
|
109 |
+
|
110 |
+
def predict_text(model, tokenizer, text, chunk_size=CHUNK_SIZE, subbatch_size=32):
    """Return the binary feature prediction for a single text.

    The text is wrapped in a one-element list, passed to ``predict_batch``,
    and the first (only) row of the result is returned.
    """
    single_item_batch = [text]
    rows = predict_batch(model, tokenizer, single_item_batch, chunk_size, subbatch_size)
    return rows[0]
|
113 |
+
```
|
114 |
+
|
115 |
+
## Single-Text Usage
|
116 |
+
``` python
|
117 |
+
# Load the model and tokenizer once; both are needed by predict_text below.
model, tokenizer = load_model_and_tokenizer()
|
118 |
+
sample_text = "This is a sample text demonstrating certain stylistic features."
|
119 |
+
# predict_text wraps the text in a one-item batch and returns that item's row.
predictions = predict_text(model, tokenizer, sample_text)
|
120 |
+
print("Binary feature vector:", predictions)
|
121 |
+
# Example output: [0, 1, 0, 1, ..., 1, 0] (one 0/1 entry per feature, 96 in total)
|
122 |
+
|
123 |
+
```
|
124 |
+
|
125 |
+
## Batch Usage
|
126 |
+
``` python
|
127 |
+
|
128 |
+
# Texts to analyse; predict_batch returns one row per entry, in this order.
docs = [
|
129 |
+
"First text goes here.",
|
130 |
+
"Second text, slightly different style."
|
131 |
+
]
|
132 |
+
model, tokenizer = load_model_and_tokenizer()
|
133 |
+
preds = predict_batch(model, tokenizer, docs)
|
134 |
+
print(preds.shape) # (2, 96): two input texts, 96 features each
|
135 |
+
```
|
136 |
+
|
137 |
+
|
138 |
+
## How It Works
|
139 |
+
|
140 |
+
Neurobiber is a fine-tuned RoBERTa model. Given a text:
|
141 |
+
1. The text is split into **chunks** (up to 512 tokens each).
|
142 |
+
2. Each chunk is fed through the model to produce **96 logistic outputs** (one per feature).
|
143 |
+
3. The feature probabilities are aggregated across chunks so that each feature is marked as `1` if it appears in at least one chunk.
|
144 |
+
|
145 |
+
Each row in `preds` is a 96-element array whose entries follow the feature order in `BIBER_FEATURES`.
|
146 |
+
|
147 |
+
## Interpreting Outputs
|
148 |
+
|
149 |
+
- Each element in the vector is a binary label (0 or 1), indicating the model’s detection of a specific linguistic feature (e.g., BIN_VBD for past tense verbs).
|
150 |
+
- Long texts are split into segments of up to 512 tokens. If a feature appears in any segment, the output for that feature is `1`.
|
151 |
+
|