from sklearn.datasets import fetch_20newsgroups
# Fetch the shuffled training subset of the 20 Newsgroups corpus.
twenty_train = fetch_20newsgroups(
    subset="train",
    shuffle=True,
)
# Show the category names; the integer targets index into this list.
twenty_train.target_names
['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']
# Peek at the first three lines of the first training document.
print(*twenty_train.data[0].split("\n")[:3], sep="\n")
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words step: learn the vocabulary from the training documents and
# turn each document into a row of token counts.
count_vects = CountVectorizer()
X_train_counts = count_vects.fit_transform(
    raw_documents=twenty_train.data,
)
# (number of documents, vocabulary size)
X_train_counts.shape
(11314, 130107)
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the raw counts with TF-IDF; the matrix shape does not change.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(
    X=X_train_counts,
)
X_train_tfidf.shape
(11314, 130107)
# Display the matrix repr (sparse format, dtype and stored-element count).
X_train_tfidf
<11314x130107 sparse matrix of type '<class 'numpy.float64'>'
	with 1787565 stored elements in Compressed Sparse Row format>
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial Naive Bayes classifier directly on the TF-IDF features.
clf = MultinomialNB()
clf = clf.fit(X=X_train_tfidf, y=twenty_train.target)
from sklearn.pipeline import Pipeline
# Chain vectorizer -> TF-IDF -> Naive Bayes so raw text can be passed straight
# to fit/predict without running each step by hand.
text_clf = Pipeline(
    steps=[
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", MultinomialNB()),
    ]
)
text_clf = text_clf.fit(X=twenty_train.data, y=twenty_train.target)
import numpy as np
# Evaluate the Naive Bayes pipeline on the held-out test subset.
twenty_test = fetch_20newsgroups(
    subset="test",
    shuffle=True,
)
predicted = text_clf.predict(twenty_test.data)
# Fraction of test documents whose predicted label matches the true label.
(predicted == twenty_test.target).mean()
0.7738980350504514
from sklearn.linear_model import SGDClassifier
# Same text pipeline, but with a linear classifier trained by SGD
# (hinge loss + L2 penalty) in place of Naive Bayes.
#
# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21, so the
# number of epochs is given with `max_iter` instead. `tol=None` turns off the
# tolerance-based early stop, so exactly 5 passes over the data are made --
# the same training schedule the old `n_iter=5` requested.
text_clf_svm = Pipeline([
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("clf-svm", SGDClassifier(
        loss="hinge",
        penalty="l2",
        alpha=1e-3,
        max_iter=5,
        tol=None,
        random_state=42,  # fixed seed for reproducible shuffling
    )),
])
/srv/paws/lib/python3.6/site-packages/sklearn/linear_model/stochastic_gradient.py:73: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.
  DeprecationWarning)
# Fit the SGD pipeline; the result is bound to `_` so the notebook does not
# echo the fitted estimator.
_ = text_clf_svm.fit(X=twenty_train.data, y=twenty_train.target)
predicted_svm = text_clf_svm.predict(twenty_test.data)
# Test-set accuracy of the SGD pipeline.
(predicted_svm == twenty_test.target).mean()
0.82381837493361654
from sklearn.model_selection import GridSearchCV
# Grid of hyper-parameters to search; each key is "<pipeline step>__<param>".
parameters = {
    "vect__ngram_range": [(1, 1), (1, 2)],  # unigrams vs. unigrams + bigrams
    "tfidf__use_idf": [True, False],
    "clf__alpha": (1e-2, 1e-3),
}
# Exhaustive cross-validated search over the Naive Bayes pipeline;
# n_jobs=-1 runs the candidate fits in parallel.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    n_jobs=-1,
)
gs_clf = gs_clf.fit(X=twenty_train.data, y=twenty_train.target)
# Score achieved by the best parameter combination found.
gs_clf.best_score_
0.90675269577514583