本次任务两天完成
# 引入 Pandas
import pandas as pd
# Read the first 15,000 rows of the tab-separated training file.
train_set = pd.read_csv(
    './data/1/train_set.csv',
    sep='\t',
    nrows=15000,
)
# Bare expression: previews the first few rows in an interactive session.
train_set.head()
import pandas as pd
from sklearn.metrics import f1_score
# Convert the labels to the form fastText expects: the label value as a
# string, prefixed with "__label__", stored in a new `label_ft` column.
label_as_text = train_set['label'].astype(str)
train_set['label_ft'] = '__label__' + label_as_text
# Bare expression: displays the frame in an interactive session.
train_set
import fasttext
# Write every row except the last 5,000 (text and label_ft columns only) to
# a tab-separated file with no header and no index; the last 5,000 rows are
# kept back for validation below.
fit_rows = train_set.iloc[:-5000]
fit_rows[['text', 'label_ft']].to_csv(
    './data/1/fasttext_train.csv',
    sep='\t',
    header=None,
    index=None,
)
# Train a supervised fastText model from that file.
model = fasttext.train_supervised(
    './data/1/fasttext_train.csv',
    lr=1.0,
    wordNgrams=2,
    verbose=2,
    minCount=1,
    epoch=25,
    loss="hs",
)
# Predict the held-out last 5,000 texts one at a time. For each prediction
# take the first returned label and keep only the part after the final
# "__", i.e. the label value without its "__label__" prefix.
held_out_texts = train_set['text'].iloc[-5000:]
val_pred = [
    model.predict(text)[0][0].split('__')[-1]
    for text in held_out_texts
]
# Print the macro-averaged F1 score; the true labels are cast to str so
# they compare against the string predictions.
val_true = train_set['label'].values[-5000:].astype(str)
print(f1_score(val_true, val_pred, average='macro'))
# Recorded output: 0.8448025413001482
# Repeat the experiment on the full dataset (no row limit on the read).
train_set = pd.read_csv('./data/1/train_set.csv', sep='\t')
label_as_text = train_set['label'].astype(str)
train_set['label_ft'] = '__label__' + label_as_text
# Write every row except the last 10,000 as the training file (same path as
# the sampled run, so that file is overwritten); the last 10,000 rows are
# kept back for validation.
fit_rows = train_set.iloc[:-10000]
fit_rows[['text', 'label_ft']].to_csv(
    './data/1/fasttext_train.csv',
    sep='\t',
    header=None,
    index=None,
)
model = fasttext.train_supervised(
    './data/1/fasttext_train.csv',
    lr=1.0,
    wordNgrams=2,
    verbose=2,
    minCount=1,
    epoch=25,
    loss="hs",
)
# Predict the held-out texts one at a time, keeping only the label value
# that follows the final "__" of the first returned label.
held_out_texts = train_set['text'].iloc[-10000:]
val_pred = [
    model.predict(text)[0][0].split('__')[-1]
    for text in held_out_texts
]
# Print the macro-averaged F1 score against the string-cast true labels.
val_true = train_set['label'].values[-10000:].astype(str)
print(f1_score(val_true, val_pred, average='macro'))
# Recorded output: 0.9132021029586797
# Build and install fastText from source: clone the repository, then
# install the package from inside the checkout.
git clone https://github.com/facebookresearch/fastText.git
cd fastText
# Alternative: pip3 install .
sudo pip install .
# Install the OpenSSL module (via Homebrew).
brew install OpenSSL
# If it still does not work, add the Anaconda environment to PATH
# (put this line in ~/.zshrc). The path below is machine-specific.
export PATH="/Users/zain/app/ds/anaconda3/bin:$PATH"
# If it still does not work, reinstall pip / pip3.
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
# pip becomes associated with whichever Python version runs the install
# script.
sudo python3 get-pip.py
# Download the pybind11 source and install it manually.
git clone https://github.com/pybind/pybind11.git
cd pybind11
# Alternative: pip3 install .
pip install .