注意:labels
- Richard: rather than spending a month figuring out an unsupervised machine learning problem, just label some data for a week and train a classifier.
原则:再好的模型也拯救不了shi一样的数据。
- pd.set_option("display.max_columns",None,"display.max_colwidth",200)
- df = pd.read_csv("")
- df.shape
- df.info()
- df.describe().round(2).T
- df.dtypes
- df["B"].dtype
- df.isnull()
- df["B"].isnull
- df["B"].unique()
- df.values
- df.columns
- df.head()
- df.tail()
- df.fillna(value = 0)
- df["B"].fillna(df["B"].mean())
- df["B"] = df["B"].map(str.strip) 清除B字段的字符空格
- df["B"] = df["B"].str.lower() 大小写转换
- df["B"].astype("int")更改数据格式:
- df = df.rename(columns = {"B":"A"})更改列名
- df["B"].drop_duplicates(keep="last") 删除重复出现的值
- df["B"].replace("sh","shanghai") 数据替换
- df = df[(True^df["class_label"].isin([2]))]
主要使用groupby 和pivot_table
- df.groupby("city").count() 对所有列进行数据汇总
- df.groupby("city")["id"].count() 按城市对id 字段进行计数
- df.groupby(["city","size"])["id"].count()
- df_inner.groupby('city')['price'].agg([len,np.sum, np.mean])
- df["review"] = df.review.apply(lambda x:BeautifulSoup(x,"html.parser").get_text())
- df["text"].str.replace(r"[^A-Za-z0-9,.!'?]"," ")
- df["text"].str.replace(r"@","at")
- df["text"].str.lower()
- df["text"]=df["text"].str.replace(r"i'm","i am")
- df["token"] = df.review.apply(nltk.word_tokenize)
- def stemmer(text):
- b = [];porter = nltk.PorterStemmer()
- for w in text:
- a = porter.stem(w);b.append(a)
- return b
- df["token"] = df.token.apply(stemmer)
- def lemmer(text):
- b = [];porter = nltk.stem.WordNetLemmatizer()
- for w in text:
- a = porter.lemmatize(w);b.append(a)
- return b
- df["token"] = df.token.apply(lemmer)
- stop = stopwords.words("english")
- def remove(text):
- a = [w for w in text if w not in stop]
- return " ".join(a)
- df["token"] = df.token.apply(remove)
- df_inner = pd.merge(df,df1,how="inner")交集
- df_lef = pd.merge(df,df1,how = "left") 左
- df_right = pd.merge(df,df1,how="right")右
- df_outer = pd.merge(df,df1,how="outer") 并
- df_inner.set_index("id")
- df_inner.sort_values(by=["age"])
- df.loc[3]
- df.iloc[:3,:2]
- df.iloc[[0,2,5],[4,5]]
- df.reset_index()
- df.set_index("date")
- df["B"].isin(["beijing"]) 判断北京是否在B列
- pd.DataFrame(category.str[:3]) 提取前三个字符并生成数据表
- df.query('city == ["bei jing"]').price.sum()
- 简单的数据采样: df.sample(n=3)
- 手动设置权重: weights = [0,0.1,0.2], df.sample(n=2,weights=weights)
- 采样不放回: df.sample(n=6,replace=False)
- 数据统计描述 df.describe().round(2).T round 设置显示小数位
- 计算标准差: df["B"].std()
- 计算协方差:df["B"].cov(df["A"])
- df.cov()
- 计算两个字段的相关性分析: df["B"].corr(df["A"])
- df.corr()
- df.to_excel("a.xlsx")
- df.to_csv("a.csv")
- list_corpus = df["text"].tolist()
- list_label = df["label"].tolist()
- from sklearn.model_selection import train_test_split
- x_train,x_test,y_train,y_test = train_test_split(list_corpus,list_label)
- from sklearn.feature_extraction.text import CountVectorizer
- def cv(text):
- counter = CountVectorizer()
- emb = counter.fit_transform(text)
- return emb,counter
- x_train,counter = cv(x_train)
- x_test = counter.transform(x_test)
- from sklearn.feature_extraction.text import TfidfVectorizer
- def tfidf(text):
- tfidf = TfidfVectorizer()
- emb = tfidf.fit_transform(text)
- return emb,tfidf
- x_train,counter = tfidf(x_train)
- x_test = counter.transform(x_test)
- import gensim
- import numpy as np
- from gensim.models import Word2Vec
- model = Word2Vec(sentences=list_corpus,size=300,window=5,min_count=5,sample=1e-3,sg=1)
- model.save("bag") 注意:此处sentence 是分词过后的。例子:[["i","love","you"],["do","you","love","me"]]
用平均的方法将句子转换成向量 再进行训练集划分
- word2vector = Word2Vec.load("bag")
- def average(text,size=300):
- if len(text) < 1:
- return np.zeros(size)
- a = [word2vector[w] if w in word2vector else np.zeros(size) for w in text]
- length = len(a)
- summed = np.sum(a,axis=0)
- ave = np.divide(summed,length)
- return ave
- df["text"] = df["text"].apply(average) 注意此处的df["text"] 未分词
- list_corpus = df["text"].tolist()
- list_label = df["label"].tolist()
- x_train,x_test,y_train,y_test = train_test_split(list_corpus,list_label,test_size=0.2,random_state=1)
- from sklearn.linear_model import LogisticRegression
- clf = LogisticRegression(penalty="l2",C=1.0,class_weight="balanced",n_jobs=-1,random_state=1,solver="newton-cg")
- clf.fit(x_train,y_train)
- y_predict = clf.predict(x_test)
- from sklearn.metrics import precision_score,accuracy_score,recall_score,f1_score
- from sklearn.metrics import confusion_matrix
- precision = precision_score(y_test,y_predict,pos_label=None,average="weighted")
- accuracy = accuracy_score(y_test,y_predict)
- recall = recall_score(y_test,y_predict,pos_label=None,average="weighted")
- f1 = f1_score(y_test,y_predict,pos_label=None,average="weighted")
- cm = confusion_matrix(y_test,y_predict)
TF-IDF:关键词、可视化嵌入
word2vec、glove、cove