doublesizebed committed on
Commit
15b2bb9
·
verified ·
1 Parent(s): a3c582d

Update README.md

Browse files

import fasttext
import string

# 1. Load your trained FastText LID model.
# The path is relative to the current working directory.
model = fasttext.load_model("lid_ms_en.bin") # your trained English↔Malay detector

def tokenize(text):
    """Split *text* into lowercase tokens with edge punctuation removed.

    The text is lowercased and split on runs of whitespace. Each piece then
    has leading and trailing ASCII punctuation (``string.punctuation``)
    stripped; punctuation inside a token (e.g. ``don't``) is kept.

    Args:
        text: The input string to tokenize.

    Returns:
        A list of non-empty token strings, in their original order. Pieces
        that consist solely of punctuation are dropped.
    """
    # Strip each piece exactly once, then filter on the stripped result
    # instead of stripping a second time just to test for emptiness.
    stripped = (piece.strip(string.punctuation) for piece in text.lower().split())
    return [token for token in stripped if token]

def predict_per_token(sentence):
    """Tag every token of *sentence* with a predicted language code.

    The sentence is split with ``tokenize`` and each token is classified
    separately by the module-level ``model``.

    Args:
        sentence: The full sentence to analyse.

    Returns:
        A list of ``(token, LANG)`` tuples in token order, where ``LANG`` is
        the top predicted label with its ``__label__`` prefix removed and
        upper-cased.
    """

    def _language_of(word):
        # Per the original note, predict returns something like
        # (['__label__ms'], [0.98]); only the first label is used.
        labels, _scores = model.predict(word)
        return labels[0].replace("__label__", "").upper()

    return [(word, _language_of(word)) for word in tokenize(sentence)]

# Example usage: classify each token of a mixed-language sentence and print
# the resulting list of (token, LANG) tuples.
input_sentence = "Saya suka chicken and fish pda hari Isnin!"
print(predict_per_token(input_sentence))

Files changed (1) hide show
  1. README.md +12 -3
README.md CHANGED
@@ -1,3 +1,12 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ datasets:
4
+ - imvladikon/leipzig_corpora_collection
5
+ language:
6
+ - ms
7
+ - en
8
+ base_model:
9
+ - facebook/fasttext-language-identification
10
+ pipeline_tag: text-classification
11
+ library_name: fasttext
12
+ ---