Update README.md
Browse files
import fasttext
import string
# 1. Load the trained FastText language-identification (LID) model once at
#    module level; predict_per_token() below reads it as a global.
model = fasttext.load_model("lid_ms_en.bin") # your trained English↔Malay detector
def tokenize(text):
    """Split text into lowercase tokens with surrounding punctuation removed.

    The text is lowercased and split on runs of whitespace; each piece then
    has leading/trailing ASCII punctuation (``string.punctuation``) stripped.
    Punctuation inside a token (e.g. the apostrophe in "don't") is kept, and
    pieces that consist only of punctuation are dropped.

    Args:
        text: The input string to tokenize.

    Returns:
        A list of non-empty token strings, in their original order.
    """
    # Strip each piece exactly once and reuse the result for the emptiness
    # filter instead of calling strip() twice per piece.
    stripped = (
        piece.strip(string.punctuation) for piece in text.lower().split()
    )
    return [token for token in stripped if token]
def predict_per_token(sentence):
    """Label every token of a sentence with its predicted language.

    Args:
        sentence: The full input sentence; it is split with tokenize().

    Returns:
        A list of (token, LANG) tuples in token order, where LANG is the
        model's top label with the "__label__" prefix removed and
        upper-cased (e.g. "MS").
    """
    preds = []
    for token in tokenize(sentence):
        # predict() returns (labels, probabilities), e.g.
        # (['__label__ms'], [0.98]); only the top label is used here.
        labels, _ = model.predict(token)
        lang = labels[0].replace("__label__", "").upper()
        preds.append((token, lang))
    return preds
# Example usage: tag a sample sentence and print the resulting list of
# (token, LANG) tuples.
input_sentence = "Saya suka chicken and fish pda hari Isnin!"
print(predict_per_token(input_sentence))
@@ -1,3 +1,12 @@
----
-license: apache-2.0
-
+---
+license: apache-2.0
+datasets:
+- imvladikon/leipzig_corpora_collection
+language:
+- ms
+- en
+base_model:
+- facebook/fasttext-language-identification
+pipeline_tag: text-classification
+library_name: fasttext
+---