Read the following important documentation:
Get the data from this site.
Save your scripts in a folder inside the data folder, calling the script folder 'my_scripts' or whatever you like. If 'my_scripts' is set as your current working directory, then the data files are available under the address '../[data file]', for instance: '../geyser1.TAB'
# Histogram of the 'Interval' column of the geyser data set.
import matplotlib.pyplot as plt
import pandas as pd

# read_csv already returns a DataFrame, so no extra pd.DataFrame() wrapper is
# needed. The separator is passed by keyword because pandas 2.x no longer
# accepts it as a positional argument.
geyser = pd.read_csv('../geyser1.TAB', sep='\t')
intervals = geyser['Interval']

ax = plt.gca()
ax.hist(intervals, bins=20, color='r')
ax.set_xlabel('Intereruption time')
ax.set_ylabel('Frequency')
ax.set_title('Histogram')
plt.show()
# Box-and-whisker plot of the 'Interval' column of the geyser data set.
import matplotlib.pyplot as plt
import pandas as pd

# The data file sits one directory above the script folder ('../'); the
# original '.../' is not a valid parent-directory reference. The separator is
# passed by keyword because pandas 2.x rejects it as a positional argument.
geyser = pd.read_csv('../geyser1.TAB', sep='\t')
intervals = geyser['Interval']

plt.boxplot(intervals)
ax = plt.gca()
ax.set_xlabel('222 cases')
ax.set_ylabel('Interruption time ( minutes')
ax.set_title('Box and Whisker Plot')
plt.show()
AB: Set both facecolor and edgecolor to change both of them. You can also use two different colors for the inside and the outline of each dot.
# Scatter plot of 'Interval' against 'Duration' for the geyser data set.
import matplotlib.pyplot as plt
import pandas as pd

# The data file sits one directory above the script folder ('../'); the
# original '.../' is not a valid parent-directory reference. The separator is
# passed by keyword because pandas 2.x rejects it as a positional argument.
geyser = pd.read_csv('../geyser1.TAB', sep='\t')
duration = geyser['Duration']
interval = geyser['Interval']

# facecolor fills each dot, edgecolor draws its outline; here both are yellow.
plt.scatter(duration, interval, facecolor='y', edgecolor='y')
ax = plt.gca()
ax.set_xlabel('Eruption duration time (minutes)')
ax.set_ylabel('Interuption time (minutes)')
ax.set_title('Scatter Plot of INTERVAL vs DURATION')
plt.show()
Note: try different examples, e.g. the whole population (the whole dataframe) or only the rows where 'Duration' <= 3.
# Summary statistics of 'Duration' restricted to the rows where Duration <= 3.
import pandas as pd

# File name spelled as documented ('geyser1.TAB') so the script also works on
# case-sensitive file systems; separator passed by keyword for pandas 2.x.
geyser = pd.read_csv('../geyser1.TAB', sep='\t')

# A single .loc call selects rows and column in one step instead of chained
# indexing (series[...][mask]).
short_durations = geyser.loc[geyser['Duration'] <= 3, 'Duration']

# Print explicitly: when run as a script, a bare expression shows nothing.
print(short_durations.describe())
Selecting rows in a dataframe: doc / example
# Side-by-side box plots of 'Interval', split by whether 'Duration' is
# at most 3 ('inf3') or greater than 3 ('sup3').
import matplotlib.pyplot as plt
import pandas as pd

# File name spelled as documented ('geyser1.TAB') so the script also works on
# case-sensitive file systems; separator passed by keyword for pandas 2.x.
geyser = pd.read_csv('../geyser1.TAB', sep='\t')

# Boolean masks passed to .loc keep only the matching rows.
duration_le_3 = geyser.loc[geyser['Duration'] <= 3]
duration_gt_3 = geyser.loc[geyser['Duration'] > 3]

plt.boxplot(
    [duration_le_3['Interval'], duration_gt_3['Interval']],
    labels=['inf3', 'sup3'],
)
# Without show() nothing is displayed when this runs as a plain script.
plt.show()
# Box-and-whisker plot of the 'Visa91' column of the adoption data set.
import matplotlib.pyplot as plt
import pandas as pd

# Raw string: backslashes in a Windows path must not be read as escape
# sequences. The resulting path is identical to the original one.
ADOPT_PATH = r'D:\Python\Libri\A_Casebook_for_a_First_Course_in_Statistics_and_Data_Analysis_Datasets\Data\Tab\adopt.TAB'

# Separator passed by keyword; pandas 2.x rejects it as a positional argument.
adopt_data = pd.read_csv(ADOPT_PATH, sep='\t')
visas_1991 = adopt_data['Visa91']

plt.boxplot(visas_1991)
ax = plt.gca()
ax.set_title('Box and Whisker Plot')
ax.set_xlabel('39 cases')
ax.set_ylabel('Number of visas in 1991')
plt.show()
# Histogram (default bins) of the 'Visa91' column of the adoption data set.
import matplotlib.pyplot as plt
import pandas as pd

# Raw string: backslashes in a Windows path must not be read as escape
# sequences. The resulting path is identical to the original one.
ADOPT_PATH = r'D:\Python\Libri\A_Casebook_for_a_First_Course_in_Statistics_and_Data_Analysis_Datasets\Data\Tab\adopt.TAB'

# Separator passed by keyword; pandas 2.x rejects it as a positional argument.
adopt_data = pd.read_csv(ADOPT_PATH, sep='\t')
visas_1991 = adopt_data['Visa91']

plt.hist(visas_1991)
plt.show()
I could not find a way to do it.
# Histogram of 'Visa91' from the adoption data set, drawn on a logarithmic
# x axis.
import matplotlib.pyplot as plt
import pandas as pd

# Raw string: backslashes in a Windows path must not be read as escape
# sequences. The resulting path is identical to the original one.
ADOPT_PATH = r'D:\Python\Libri\A_Casebook_for_a_First_Course_in_Statistics_and_Data_Analysis_Datasets\Data\Tab\adopt.TAB'

# read_csv already returns a DataFrame; separator passed by keyword because
# pandas 2.x rejects it as a positional argument.
adopt = pd.read_csv(ADOPT_PATH, sep='\t')
visas_1991 = adopt['Visa91']

ax = plt.gca()
# The original call passed plt.loglog(0.5, 3.5) as a positional argument after
# bins=10, which is a SyntaxError, and hist() takes no such argument anyway.
# NOTE: hist(..., log=True) would put the *frequencies* on a log scale, which
# is not what is wanted here; the x axis is switched to log scale instead.
ax.hist(visas_1991, bins=10, color='r')
ax.set_xscale("log")
ax.set_xlabel('Log (Number of 1991 visas')
ax.set_ylabel('Frequency')
ax.set_title('Histogram')
plt.show()
# Scatter plot of 'Visa91' against 'Visa88' for the adoption data set.
import matplotlib.pyplot as plt
import pandas as pd

# The original path '...\adopt.TAB' was abbreviated, and its '\a' is parsed as
# the BEL control character. The full location used by the other adopt.TAB
# scripts on this page is spelled out here as a raw string.
# NOTE(review): confirm this is where adopt.TAB lives on your machine.
ADOPT_PATH = r'D:\Python\Libri\A_Casebook_for_a_First_Course_in_Statistics_and_Data_Analysis_Datasets\Data\Tab\adopt.TAB'

# Separator passed by keyword; pandas 2.x rejects it as a positional argument.
adoption = pd.read_csv(ADOPT_PATH, sep='\t')
visas_1988 = adoption['Visa88']
visas_1991 = adoption['Visa91']

plt.scatter(visas_1988, visas_1991, facecolor='y', edgecolor='y')
ax = plt.gca()
ax.set_xlabel('Number of Visas in 1988')
ax.set_ylim([0, 2700])
ax.set_xlim([0, 5000])
ax.set_ylabel('Number of Visas in 1991')
ax.set_title('ScatterPlot of Visa91 vs Visa88')
plt.show()
# Scatter plot of 'Visa92' against 'Visa91' for the adoption data set.
import matplotlib.pyplot as plt
import pandas as pd

# Raw string: backslashes in a Windows path must not be read as escape
# sequences. The resulting path is identical to the original one.
ADOPT_PATH = r'D:\Python\Libri\A_Casebook_for_a_First_Course_in_Statistics_and_Data_Analysis_Datasets\Data\Tab\adopt.TAB'

# Separator passed by keyword; pandas 2.x rejects it as a positional argument.
adoption = pd.read_csv(ADOPT_PATH, sep='\t')
visas_1991 = adoption['Visa91']
visas_1992 = adoption['Visa92']

plt.scatter(visas_1991, visas_1992, facecolor='y', edgecolor='y')
ax = plt.gca()
ax.set_xlabel('Number of Visas in 1991')
ax.set_ylim([0, 1800])
ax.set_xlim([0, 2700])
ax.set_ylabel('Number of Visas in 1992')
ax.set_title('ScatterPlot of Visa92 vs Visa91')
plt.show()
Modified the bins of both histograms: the histogram is reliable for the “Old Faithful” geyser but not for the adoption rates. The appearance of the histogram changes quite a lot when the number of bins is changed.
# Scatter plot of the 'Producti' column against the 'Quality' column of the
# prdq data set.
import matplotlib.pyplot as plt
import pandas as pd

# Raw string: backslashes in a Windows path must not be read as escape
# sequences. The resulting path is identical to the original one.
PRDQ_PATH = r'D:\Python\Libri\A_Casebook_for_a_First_Course_in_Statistics_and_Data_Analysis_Datasets\Data\Tab\prdq.TAB'

# Separator passed by keyword; pandas 2.x rejects it as a positional argument.
prdq = pd.read_csv(PRDQ_PATH, sep='\t')
productivity_y = prdq['Producti']
quality_x = prdq['Quality']

# scatter() takes x first, then y, so quality goes first to match the axis
# labels below. It has no 'bins' parameter (that belongs to hist) and the
# colour keyword is 'color', not 'colors'.
plt.scatter(quality_x, productivity_y, color='r')
ax = plt.gca()
# The Axes methods are lower-case: set_xlabel / set_ylabel.
ax.set_xlabel('Assembly defects per 100 cars')
ax.set_ylabel('Hours per vehicle')
ax.set_title('Scatter Plot of Productivity VS Quality')
plt.show()
# Scatter plot of the 'ProdJapn' column against the 'QualJapn' column of the
# prdq data set.
import matplotlib.pyplot as plt
import pandas as pd

# Raw string: backslashes in a Windows path must not be read as escape
# sequences. The resulting path is identical to the original one.
PRDQ_PATH = r'D:\Python\Libri\A_Casebook_for_a_First_Course_in_Statistics_and_Data_Analysis_Datasets\Data\Tab\prdq.TAB'

# Separator passed by keyword; pandas 2.x rejects it as a positional argument.
prdq = pd.read_csv(PRDQ_PATH, sep='\t')
productivity_y = prdq['ProdJapn']
quality_x = prdq['QualJapn']

# scatter() takes x first, then y, so quality goes first to match the axis
# labels below. It has no 'bins' parameter (that belongs to hist) and the
# colour keyword is 'color', not 'colors'.
plt.scatter(quality_x, productivity_y, color='r')
ax = plt.gca()
# The Axes methods are lower-case: set_xlabel / set_ylabel.
ax.set_xlabel('Assembly defects per 100 cars (Japanese origin)')
ax.set_ylabel('Hours per vehicle (Japanese origin')
ax.set_title('Scatter Plot of PRODJAPN VS QUALJAPN')
plt.show()
# Scatter plot of the 'ProdNonJ' column against the 'QualNonJ' column of the
# prdq data set.
import matplotlib.pyplot as plt
import pandas as pd

# Raw string: backslashes in a Windows path must not be read as escape
# sequences. The resulting path is identical to the original one.
PRDQ_PATH = r'D:\Python\Libri\A_Casebook_for_a_First_Course_in_Statistics_and_Data_Analysis_Datasets\Data\Tab\prdq.TAB'

# Separator passed by keyword; pandas 2.x rejects it as a positional argument.
prdq = pd.read_csv(PRDQ_PATH, sep='\t')
productivity_y = prdq['ProdNonJ']
quality_x = prdq['QualNonJ']

# scatter() takes x first, then y, so quality goes first to match the axis
# labels below. It has no 'bins' parameter (that belongs to hist) and the
# colour keyword is 'color', not 'colors'.
plt.scatter(quality_x, productivity_y, color='r')
ax = plt.gca()
# The Axes methods are lower-case: set_xlabel / set_ylabel.
ax.set_xlabel('Assembly defects per 100 cars (non-Japanese origin)')
ax.set_ylabel('Hours per vehicle (non-Japanese origin')
ax.set_title('Scatter Plot of PRODNONJ VS QUALNONJ')
plt.show()
# Scatter plot of the 'Producti' column against the 'Quality' column of the
# prdq data set.
import matplotlib.pyplot as plt
import pandas as pd

# Raw string: backslashes in a Windows path must not be read as escape
# sequences. The resulting path is identical to the original one.
PRDQ_PATH = r'D:\Python\Libri\A_Casebook_for_a_First_Course_in_Statistics_and_Data_Analysis_Datasets\Data\Tab\prdq.TAB'

# Separator passed by keyword; pandas 2.x rejects it as a positional argument.
prdq = pd.read_csv(PRDQ_PATH, sep='\t')
productivity_y = prdq['Producti']
quality_x = prdq['Quality']

# scatter() takes x first, then y, so quality goes first to match the axis
# labels below. It has no 'bins' parameter (that belongs to hist) and the
# colour keyword is 'color', not 'colors'.
plt.scatter(quality_x, productivity_y, color='r')
ax = plt.gca()
# The Axes methods are lower-case: set_xlabel / set_ylabel.
ax.set_xlabel('Assembly defects per 100 cars')
ax.set_ylabel('Hours per vehicle')
ax.set_title('Scatter Plot of PRODUCTIVITY VS QUALITY')
plt.show()
It worked the first time, but now it no longer works. Maybe a Windows error again?
# Two box-plot comparisons of non-Japanese vs Japanese values from the prdq
# data set: #1 for the quality columns, #2 for the productivity columns.
import matplotlib.pyplot as plt
import pandas as pd

# Raw string: backslashes in a Windows path must not be read as escape
# sequences. The resulting path is identical to the original one.
PRDQ_PATH = r'D:\Python\Libri\A_Casebook_for_a_First_Course_in_Statistics_and_Data_Analysis_Datasets\Data\Tab\prdq.TAB'

# Separator passed by keyword; pandas 2.x rejects it as a positional argument.
# The file is read once and shared by both plots.
data_comparison = pd.read_csv(PRDQ_PATH, sep='\t')

# The original used data.loc[data['QualNonJ']], which treats the column's
# *values* as row labels and fails (or picks arbitrary rows) instead of
# splitting the data by origin. The per-origin columns are plotted directly.
# NOTE(review): assumes QualNonJ/QualJapn and ProdNonJ/ProdJapn hold the
# Quality/Producti values of the respective group, with missing values for
# the other group - confirm against the data file. dropna() removes those
# missing entries, which would otherwise leave the box empty.

#1 Quality: non-Japanese vs Japanese
plt.boxplot(
    [data_comparison['QualNonJ'].dropna(), data_comparison['QualJapn'].dropna()],
    labels=['Non-japanese', 'Japanese'],
)
plt.show()

#2 Productivity: non-Japanese vs Japanese
plt.boxplot(
    [data_comparison['ProdNonJ'].dropna(), data_comparison['ProdJapn'].dropna()],
    labels=['Non-japanese', 'Japanese'],
)
plt.show()