Update README.md

import fasttext
import string

# 1. Load your trained FastText LID model
model = fasttext.load_model("lid_ms_en.bin")  # your trained English↔Malay detector

def tokenize(text):
    """
    Split text into lowercase tokens with surrounding punctuation removed.

    The text is lowercased and split on runs of whitespace; ASCII
    punctuation (``string.punctuation``) is then stripped from both ends of
    each piece. Punctuation inside a token (e.g. the apostrophe in "don't")
    is kept, and pieces that consist only of punctuation are dropped.

    Args:
        text: The input string to tokenize.

    Returns:
        A list of non-empty token strings, in their original order.
    """
    # Strip each piece once, then discard the ones that end up empty.
    pieces = text.lower().split()
    stripped = (piece.strip(string.punctuation) for piece in pieces)
    return [token for token in stripped if token]

def predict_per_token(sentence):
    """
    Label every token of a sentence with its predicted language.

    The sentence is split with ``tokenize`` and each token is passed on its
    own to the module-level FastText ``model``.

    Args:
        sentence: The full input sentence as a string.

    Returns:
        A list of ``(token, LANG)`` tuples in token order, where ``LANG`` is
        the top predicted label with the ``__label__`` prefix removed and
        upper-cased (e.g. ``"MS"``).
    """
    preds = []
    for token in tokenize(sentence):
        # predict() returns (labels, probabilities), e.g.
        # (['__label__ms'], [0.98]); only the top label is used.
        labels, _ = model.predict(token)
        lang = labels[0].replace("__label__", "").upper()
        preds.append((token, lang))
    return preds

# Example usage: a sentence mixing Malay and English words; prints the
# list of (token, LANG) tuples returned by predict_per_token.
input_sentence = "Saya suka chicken and fish pda hari Isnin!"
print(predict_per_token(input_sentence))

Files changed (1) hide show

README.md +12 -3

README.md CHANGED Viewed

@@ -1,3 +1,12 @@
----
-license: apache-2.0
----

+---
+license: apache-2.0
+datasets:
+- imvladikon/leipzig_corpora_collection
+language:
+- ms
+- en
+base_model:
+- facebook/fasttext-language-identification
+pipeline_tag: text-classification
+library_name: fasttext
+---