# 第一列为:输入(药品名称)
# 第二列为:输出(药品类型)
# 药品分类数据
import jieba
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB  # naive Bayes
from sklearn.pipeline import Pipeline


def cutname(x: str) -> str:
    """Segment *x* with jieba and join the resulting tokens with single spaces."""
    return ' '.join(jieba.lcut(x))


# NOTE(review): `data` is not defined in the visible part of this file; it is
# presumably a pandas DataFrame with '药品名称' and '药品类型' columns loaded
# earlier -- confirm before running this as a standalone script.
# The pandas method is lower-case `apply`; `.Apply` raises AttributeError.
data['药品名称'] = data['药品名称'].apply(cutname)

# Hold out 30% of the rows for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    data.药品名称, data.药品类型, test_size=0.3
)

# vectorizer => transformer => classifier
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf.fit(x_train, y_train)  # train

# Predict once and reuse the result for both the report and the accuracy.
predicted = text_clf.predict(x_test)
print(classification_report(y_test, predicted))
# Fraction of test rows predicted correctly. As a bare expression this value is
# only echoed in an interactive session / notebook cell.
np.mean(predicted == y_test)
# 模型评估
# 模型评估
# Classify a single drug name. The query goes through the same jieba
# segmentation (`cutname`) that was applied to the training text, so the
# vectorizer sees tokens in the same form it was fitted on.
text_clf.predict([cutname('阿莫西林')])
# 阿莫西林药品分类