本地加载
环境依赖:
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple transformers sentencepiece
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu124
模型下载:
https://hf-mirror.com/facebook/nllb-200-distilled-600M
支持的语言:
https://hf-mirror.com/facebook/nllb-200-distilled-600M/blob/main/README.md
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Load the tokenizer and the seq2seq model from the same local checkpoint.
MODEL_PATH = "/data/models/facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)

content = """
A database of Chinese surnames and Chinese given names (1930-2008). This database contains nationwide frequency statistics of 1,806 Chinese surnames and 2,614 Chinese characters used in given names, covering about 1.2 billion Han Chinese population (96.8% of the Han Chinese household-registered population born from 1930 to 2008 and still alive in 2008). This package also contains a function for computing multiple features of Chinese surnames and Chinese given names for scientific research (e.g., name uniqueness, name gender, name valence, and name warmth/competence).
"""

# Encode the whole text as a single input sequence of PyTorch tensors.
inputs = tokenizer(content, return_tensors="pt")

# Forcing the first generated token to the "zho_Hans" language code selects
# the output language.
target_lang_id = tokenizer.convert_tokens_to_ids("zho_Hans")
translated_tokens = model.generate(
    **inputs,
    forced_bos_token_id=target_lang_id,
    num_beams=4,
)

# Decode every generated sequence and print one translation per line.
decoded = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
for line in decoded:
    print(line)
拆分长文本
sudo apt-get install -y g++
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple "stopes[mono]" botok khmer-nltk laonlp
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from stopes.pipelines.monolingual.utils.sentence_split import get_split_algo
# Load the tokenizer and the seq2seq model from the same local checkpoint.
MODEL_PATH = "/data/models/facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)

content = """
A database of Chinese surnames and Chinese given names (1930-2008). This database contains nationwide frequency statistics of 1,806 Chinese surnames and 2,614 Chinese characters used in given names, covering about 1.2 billion Han Chinese population (96.8% of the Han Chinese household-registered population born from 1930 to 2008 and still alive in 2008). This package also contains a function for computing multiple features of Chinese surnames and Chinese given names for scientific research (e.g., name uniqueness, name gender, name valence, and name warmth/competence).
"""

# Break the text into individual sentences first; the model is meant to be
# fed one sentence per input rather than a whole paragraph.
splitter = get_split_algo("eng", "default")
sentence_iter = splitter(content)
input_sentences = list(sentence_iter)
print(len(input_sentences)) # 3

# Encode all sentences as one batch; padding aligns them to a common length.
inputs = tokenizer(input_sentences, return_tensors="pt", padding=True)

# Forcing the first generated token to the "zho_Hans" language code selects
# the output language.
target_lang_id = tokenizer.convert_tokens_to_ids("zho_Hans")
translated_tokens = model.generate(
    **inputs,
    forced_bos_token_id=target_lang_id,
    num_beams=4,
)

# Decode every generated sequence and print one translated sentence per line.
decoded = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
for line in decoded:
    print(line)
cuda加载
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Fail fast when no GPU is present: this example is specifically about
# running the model on CUDA.
if not torch.cuda.is_available():
    raise RuntimeError("CUDA is not available on this machine.")
# CUDA availability was verified above, so a CPU fallback would be unreachable.
device = torch.device("cuda")

tokenizer = AutoTokenizer.from_pretrained("/data/models/facebook/nllb-200-distilled-600M")
model = AutoModelForSeq2SeqLM.from_pretrained("/data/models/facebook/nllb-200-distilled-600M")
# Move the model to the GPU.
model.to(device)

content = """
A database of Chinese surnames and Chinese given names (1930-2008). This database contains nationwide frequency statistics of 1,806 Chinese surnames and 2,614 Chinese characters used in given names, covering about 1.2 billion Han Chinese population (96.8% of the Han Chinese household-registered population born from 1930 to 2008 and still alive in 2008). This package also contains a function for computing multiple features of Chinese surnames and Chinese given names for scientific research (e.g., name uniqueness, name gender, name valence, and name warmth/competence).
"""

# Encode the input and move the resulting tensors to the GPU so they live on
# the same device as the model.
inputs = tokenizer(content, return_tensors="pt").to(device)
translated_tokens = model.generate(
    **inputs,
    forced_bos_token_id=tokenizer.convert_tokens_to_ids("zho_Hans"),
    num_beams=4,
)

# Decode the generated tokens back into text.
for translated in tokenizer.batch_decode(translated_tokens, skip_special_tokens=True):
    print(translated)

# Drop the references to both the input and the output tensors before
# clearing the cache; memory still referenced by a live tensor cannot be
# released by empty_cache().
del inputs, translated_tokens
torch.cuda.empty_cache()
pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
# Load the tokenizer and the seq2seq model from the same local checkpoint.
MODEL_PATH = "/data/models/facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)

# Build a translation pipeline configured for Simplified Chinese -> English.
translator = pipeline(
    'translation',
    model=model,
    tokenizer=tokenizer,
    src_lang='zho_Hans',
    tgt_lang='eng_Latn',
    max_length=512,
)

# Translate a small batch of inputs and print the raw pipeline result.
samples = ["你好 世界", "青春"]
results = translator(samples)
print(results)