Kaggle挑戦記

KaggleのTitanicに参加した記録。 データを目で見て分かったのは、

  • 兄弟の少ない子供は比較的助かった
  • 兄弟の多い子供は比較的助からなかった

ということぐらいでしたが、機械学習で分類器を作った結果(データを読み込ませてSVMで分類させただけという何の工夫もないやり方ですが・・・)、0.77990というスコアになりました。そのうち細かいチューニングもして再チャレンジしてみます。

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Load the Kaggle Titanic training set and preview its first five rows.
train = pd.read_csv(filepath_or_buffer="train.csv")
train.head(n=5)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
# Remove the columns that are not used as features, then preview the result.
train = train.drop(
    labels=["PassengerId", "Name", "Ticket", "Cabin"],
    axis="columns",
)
train.head(n=5)
Survived Pclass Sex Age SibSp Parch Fare Embarked
0 0 3 male 22.0 1 0 7.2500 S
1 1 1 female 38.0 1 0 71.2833 C
2 1 3 female 26.0 0 0 7.9250 S
3 1 1 female 35.0 1 0 53.1000 S
4 0 3 male 35.0 0 0 8.0500 S
# Nothing in this cell feeds the classifier; it only exists to get a feel
# for the data.

import matplotlib.gridspec as gridspec

# Split the passengers by outcome so each group can be drawn in its own colour.
Survive = train[train["Survived"]==1]
Dead = train[train["Survived"]==0]

# Alpha value applied to every scatter point.
transparency = 0.1

plt.figure(figsize=(16, 5))
# Left panel: SibSp (number of siblings / spouses).
# Right panel: Parch (number of parents / children).
for panel, column in enumerate(["SibSp", "Parch"], start=1):
    plt.subplot(1, 2, panel)
    for group, colour, name in ((Survive, "blue", "Survived"), (Dead, "red", "Dead")):
        plt.scatter(group["Age"], group[column], c=colour, alpha=transparency, label=name)
    plt.xlabel("Age")
    plt.ylabel(column)
    plt.legend()
plt.show()

f:id:takekbys:20171008213120p:plain

# Count the missing values in each column.
train.isna().sum()
Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64
# Candidate values for filling the gaps: the mean Age and the frequency of
# each Embarked value.
for candidate in (train["Age"].mean(), train["Embarked"].value_counts()):
    print(candidate)
29.6991176471
S    644
C    168
Q     77
Name: Embarked, dtype: int64
# Fill the gaps: Age with its mean, Embarked with "S".
for column, filler in (("Age", train["Age"].mean()), ("Embarked", "S")):
    train[column] = train[column].fillna(filler)
# Count the missing values in each column again after filling.
train.isna().sum()
Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64
# Encode Sex and Embarked as integers (the same codes are used for train and test).
sex_codes = {"male": 0, "female": 1}
embarked_codes = {"C":0,"Q":1,"S":2}
train["Sex"] = train["Sex"].map(sex_codes)
train["Embarked"] = train["Embarked"].map(embarked_codes)
# Capture the training-set statistics BEFORE the columns are overwritten, so
# the test set can be imputed and scaled with exactly the same values. Scaling
# the test set with its own mean/range would put its features on a different
# scale from the one the classifier is trained on.
age_mean = train["Age"].mean()
age_range = train["Age"].max()-train["Age"].min()
fare_mean = train["Fare"].mean()
fare_range = train["Fare"].max()-train["Fare"].min()
# Normalise Age and Fare: centre on the mean and divide by the range.
train["Age"] = (train["Age"]-age_mean)/age_range
train["Fare"] = (train["Fare"]-fare_mean)/fare_range
# Load the test set and apply the same preprocessing as the training set.
test = pd.read_csv("test.csv")
test["Sex"] = test["Sex"].map(sex_codes)
# Missing Embarked values are filled with "S", as was done for the training set.
test["Embarked"] = test["Embarked"].fillna("S").map(embarked_codes)
# Impute with the training means, then scale with the training mean and range.
test["Age"] = test["Age"].fillna(age_mean)
test["Fare"] = test["Fare"].fillna(fare_mean)
test["Age"] = (test["Age"]-age_mean)/age_range
test["Fare"] = (test["Fare"]-fare_mean)/fare_range
# PassengerId is kept here; the unused text columns are dropped.
test = test.drop(["Name", "Ticket", "Cabin"], axis=1)

# Separate the data into what the learner needs: training features, training
# target, and test features (without the PassengerId column).
Y_train = train["Survived"]
X_train = train.drop(labels="Survived", axis="columns")
X_test = test.drop(labels="PassengerId", axis="columns").copy()
tuple(frame.shape for frame in (X_train, Y_train, X_test))
((891, 7), (891,), (418, 7))
# Training: estimate the accuracy of a default SVC with 10-fold cross-validation.
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

# No random_state is passed: without shuffle=True the folds are contiguous and
# deterministic, so the seed has no effect, and recent scikit-learn releases
# raise ValueError when random_state is set while shuffle is False.
kfold = KFold(n_splits=10)
result = cross_val_score(SVC(),X_train,Y_train, cv = kfold, scoring = "accuracy")
# One accuracy value per fold.
print(result)
[ 0.78888889  0.84269663  0.76404494  0.85393258  0.79775281  0.80898876
  0.78651685  0.7752809   0.85393258  0.79775281]
# Fit on the full training set (fit() returns the estimator itself) and
# predict the test passengers.
classifier = SVC().fit(X_train, Y_train)
result = classifier.predict(X_test)

# Build the submission: one row per test passenger with the predicted label.
submission = pd.DataFrame({"PassengerId": test["PassengerId"]})
submission["Survived"] = result

# Write the submission file without the index column.
submission.to_csv(path_or_buf="submission.csv", index=False)