import numpy as np
import pandas as pd
import seaborn as sns
from scipy.integrate import odeint
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm
from sklearn import metrics
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.svm import SVC
# load the first two columns of the CSV, skipping the first (header) row
column1, column2 = np.loadtxt('data.csv', usecols=(0, 1), unpack=True, skiprows=1, delimiter=',')
# Find the best-fit parameters for a degree-2 polynomial
parameters = np.polyfit(x, y, 2)
# Create a function from the parameters
fit_function = np.poly1d(parameters)
# Use that function to calculate the values of our fit line
expected_y = fit_function(x)
# Plot the original data and our fit line
plt.scatter(x, y, label='data')
plt.plot(x, expected_y, color='orange', label='fit')
plt.legend()
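As a quick goodness-of-fit check, you can compute the coefficient of determination from the residuals of the fit above; this is a small sketch using only NumPy (the R² calculation is an addition, not part of the original snippet):

# coefficient of determination: R^2 = 1 - SS_res / SS_tot
ss_res = np.sum((y - expected_y) ** 2)   # residual sum of squares
ss_tot = np.sum((y - np.mean(y)) ** 2)   # total sum of squares
print("R^2 =", 1 - ss_res / ss_tot)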
# specify the encoding when the file is not plain UTF-8
data_frame = pd.read_csv('data.csv', encoding='ISO-8859-1')
# create a DataFrame
data_frame = pd.DataFrame({"name": ["james", "jim", "joan", "jill"],
                           "age": [10, 20, 30, 40],
                           "weight": [150, 140, 130, 120],
                           "complaint": ["sprain", "cut", "headache", "break"]})
# displays the top rows of the dataframe
data_frame.head()
# displays the bottom rows of the dataframe
data_frame.tail()
# set the row index to the age column
data_frame.set_index("age", inplace=True)
# reset the row index to 0-based integers
data_frame.reset_index(inplace=True)
# returns the row with the given index label
data_frame.loc["rowname"]
# returns the row at the given 0-based position
data_frame.iloc[2]
# adds/replaces a column (col_content is a placeholder for your values)
data_frame = data_frame.assign(col_name=col_content)
# removes columns from the dataframe
data_frame = data_frame.drop(columns=["year", "month", "day"])
# displays various stats about the dataframe
data_frame.describe()
# a list of columns in the dataframe
data_frame.columns
# converts a dataframe with year, month, and day columns to a series of datetimes
pd.to_datetime(data_frame)
# produces a histogram of a column
data_frame['col1'].plot.hist()
# produces a scatter plot of two columns
data_frame.plot.scatter(x='col2', y='col1')
# produces a boxplot of two columns
data_frame.boxplot(column=['col1', 'col2'])
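To make the loc/iloc distinction concrete, here is a short illustrative example using the sample frame above (assuming set_index("age", ...) has just run, so the ages are the index labels):

data_frame.set_index("age", inplace=True)
data_frame.loc[30]   # label-based: the row whose index label (age) is 30 -> joan
data_frame.iloc[2]   # position-based: the third row, which is also joan here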
# create a Series
series = pd.Series([1, 2, 3, 4], index=['one', 'two', 'three', 'four'])
# mean of the series
series.mean()
# list of indexes in the series
series.index.values
# list of values in the series
series.values
sns.set()          # sets styling to seaborn defaults
sns.reset_orig()   # returns styling to matplotlib defaults
sns.despine()      # removes the top and right borders
# List of the available styles:
sns.set_style("ticks")
sns.set_style("darkgrid")
sns.set_style("whitegrid")
sns.set_style("dark")
sns.set_style("white")
# List of the available contexts:
sns.set_context("paper")
sns.set_context("talk")
sns.set_context("poster")
sns.set_context("notebook")
# Jointplot scatter with a regression line
sns.jointplot(x=data_frame['col1'], y=data_frame['col2'], kind="reg")
sns.set(font_scale=.7)
# heatmap of the correlation matrix of the numeric columns
hm = sns.heatmap(data=data.drop(columns=["year", "Country name"]).corr(), annot=True)
hm.set_ylim(12.0, 0.0)  # works around a matplotlib version that clips the top and bottom rows
# function that we will fit the data to
def curve_function(x, a, b, c):
    return a * np.cos(b * x + c)

# use curve_fit to get the parameters for the function
parameters, covariance = curve_fit(curve_function, x, y)
a, b, c = parameters  # unpack the fitted a, b, c parameters

# use our function to calculate the expected values of y
y_expected = curve_function(x, a, b, c)

# plot our data and fit line
plt.scatter(x, y, label="data")
plt.plot(x, y_expected, color="orange", label="fit")
plt.legend()
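For oscillatory models like this, curve_fit can settle into a poor local minimum if it starts far from the true frequency; passing an initial guess via the p0 argument usually helps. A minimal sketch (the guess values below are placeholders, not from the original):

initial_guess = [1.0, 2.0, 0.0]  # rough guesses for a, b, c; tune these to your data
parameters, covariance = curve_fit(curve_function, x, y, p0=initial_guess)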
For example, if we have a system of variables X, Y, and Z with time derivatives

dX/dt = Y + a*t
dY/dt = Z
dZ/dt = b
where a and b are parameters for the derivatives.
We can solve these equations numerically with odeint using the following code:
def derivatives(state, time, a, b):  # put the parameters here
    X = state[0]
    Y = state[1]
    Z = state[2]
    dXdt = Y + a * time  # set the derivative equations here
    dYdt = Z
    dZdt = b
    return [dXdt, dYdt, dZdt]

a = 0.5  # set the values of your parameters here
b = 4 / 3
init = [1, .1, 2]  # set the initial conditions here
time = np.linspace(0, 50, 500)  # set the times to integrate across
# for this example we will run from 0 to 50 with 500 points
solution = odeint(derivatives, init, time, args=(a, b))  # call odeint with the initial conditions and parameters
X = solution[:, 0]
Y = solution[:, 1]
Z = solution[:, 2]
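SciPy also provides solve_ivp, a newer interface for the same task; note that its right-hand-side function takes (t, state) in the opposite argument order from odeint, and the args keyword needs SciPy 1.4 or later. A hedged sketch of the equivalent call:

from scipy.integrate import solve_ivp

def rhs(t, state, a, b):  # note the (t, state) argument order
    X, Y, Z = state
    return [Y + a * t, Z, b]

result = solve_ivp(rhs, (0, 50), init, t_eval=time, args=(a, b))
X, Y, Z = result.y  # result.y has shape (3, len(time))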
plt.figure(1, figsize=(12, 7))  # creates and sets the current figure with figure number 1
# change this '1' to a different number if you need more than one figure
plt.subplot(224)  # selects the fourth plot in a 2 by 2 grid of subplots
# this line is unnecessary if you only need to display one plot
plt.plot(time, X, label='x-values')
plt.plot(time, Y, label='y-values')
plt.plot(time, Z, label='z-values')
plt.grid()    # enables the grid
plt.legend()  # enables the legend
plt.xlabel('Time')               # creates a label for the x axis
plt.ylabel('Calculated Values')  # creates a label for the y axis
plt.title('Example Title')       # adds a title for the whole plot
plt.tight_layout()               # prevents the titles from overlapping
plt.savefig("Example.png")       # saves the figure as an image
Diagram of plt.subplot(): in subplot(224), the digits mean a 2-row by 2-column grid, selecting panel 4, with panels counted left to right, top to bottom.
# build a 100x100 multiplication table and display it as a heatmap
heatmap = []
for i in range(100):
    heatmap.append([])
    for j in range(100):
        heatmap[i].append(i * j)
plt.matshow(heatmap)
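The same grid can be built in one line with NumPy's outer product, which is both faster and clearer:

heatmap = np.outer(np.arange(100), np.arange(100))  # heatmap[i, j] == i * j
plt.matshow(heatmap)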
x_with_cnst = sm.add_constant(x_values)       # creates an array of x values with a column of 1s
linear_model = sm.OLS(y_values, x_with_cnst)  # creates the model
results = linear_model.fit()                  # calculates the best-fit parameters
results.params     # returns the intercept and slope
results.summary()  # returns a summary of the model
results.predict()  # returns an array of predicted y values
# OLS takes the dependent variable first, then the matrix of regressors
model = sm.OLS(data["x_values"], data.drop(columns=["x_values"]))
result = model.fit()
result.summary()
fig = plt.figure(figsize=(15, 8))
fig = sm.graphics.plot_regress_exog(results, "x1", fig=fig)
print("Confusion matrix:\n",metrics.confusion_matrix(test_labels, test_predict)) print("Classification report:\n",metrics.classification_report(test_labels, test_predict)) print("Accuracy:",metrics.accuracy_score(test_labels, test_predict)) print("Precision:",metrics.precision_score(test_labels, test_predict)) print("Recall:",metrics.recall_score(test_labels, test_predict)) print("F_1 Score:",metrics.f1_score(test_labels, test_predict))
fpr, tpr, thresholds = metrics.roc_curve(test_labels, test_predict)
plt.plot(fpr, tpr, label="ROC curve")
plt.plot([0, 1], [0, 1], label="chance line")
plt.legend()
print(metrics.auc(fpr, tpr))
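One caveat: roc_curve expects continuous scores (such as predicted probabilities), while the accuracy and confusion-matrix functions above expect hard class labels. A hedged sketch of moving between the two, where test_probs stands in for your model's predicted probabilities of the positive class:

# test_probs: predicted probabilities for the positive class (model-dependent placeholder)
fpr, tpr, thresholds = metrics.roc_curve(test_labels, test_probs)
test_predict = (test_probs > 0.5).astype(int)  # threshold to hard 0/1 labels for accuracy etc.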
# two Gaussian blobs in 2D, with the true centers returned as well
vectors, labels, centers = datasets.make_blobs(n_samples=100, n_features=2, centers=2,
                                               random_state=2614, return_centers=True)
# two concentric circles
vectors, labels = datasets.make_circles(n_samples=100, random_state=655)
# handwritten digits
sk_data = datasets.load_digits()
vectors, labels, categories = (sk_data.data, sk_data.target, sk_data.target_names)
# Labeled Faces in the Wild
sk_data = datasets.fetch_lfw_people(min_faces_per_person=50, resize=0.4)
vectors, labels, categories = (sk_data.data, sk_data.target, sk_data.target_names)
train_vectors, test_vectors, train_labels, test_labels = train_test_split(vectors, labels)
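By default this holds out 25% of the rows with a fresh random shuffle on every run; the split can be made reproducible and stratified by class if needed, for example:

train_vectors, test_vectors, train_labels, test_labels = train_test_split(
    vectors, labels, test_size=0.25, random_state=0, stratify=labels)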
logit_model = sm.Logit(train_labels, sm.add_constant(train_vectors))
result = logit_model.fit()
test_predict = result.predict(sm.add_constant(test_vectors))  # returns predicted probabilities
scaler = StandardScaler()
vectors = scaler.fit_transform(vectors)
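If the data has already been split, standard practice is to fit the scaler on the training set only and reuse it on the test set, so no information leaks from the test data:

scaler = StandardScaler()
train_vectors = scaler.fit_transform(train_vectors)  # learn mean/std from training data only
test_vectors = scaler.transform(test_vectors)        # apply the same scaling to the test data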
knn = KNeighborsClassifier(n_neighbors=5)
result = knn.fit(train_vectors, train_labels)
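The fitted classifier can then produce the test_predict array used by the metrics snippets above, for instance:

test_predict = knn.predict(test_vectors)     # predicted class labels
print(knn.score(test_vectors, test_labels))  # mean accuracy on the test set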
n_components = 17  # this is much less than the original n_features
print("Extracting the top %d eigenfaces from %d faces" % (n_components, train_vectors.shape[0]))
# set up the PCA object with the number of components we want to find
pca = PCA(n_components=n_components, whiten=True)
# fit the PCA model to the training data
result = pca.fit(train_vectors)
# transform the original data
pca_train_vectors = pca.transform(train_vectors)
pca_test_vectors = pca.transform(test_vectors)
pca.explained_variance_ratio_  # returns the fraction of variance explained by each component
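Rather than hard-coding 17 components, PCA also accepts a variance fraction for n_components and will choose however many components that requires; a small sketch:

pca = PCA(n_components=0.95, whiten=True)  # keep enough components to explain 95% of the variance
pca_train_vectors = pca.fit_transform(train_vectors)
print(pca.n_components_)  # how many components were actually kept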
print("Fitting the classifier to the training set") param_grid = {'C': [.001, .005, .0001, .0005, .00001], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], } clf = GridSearchCV(SVC(kernel='linear', class_weight='balanced'), param_grid, n_jobs=-1) result = clf.fit(train_vectors, train_labels)