Python common syntax for data mining
Built-in / Numpy/ Pandas / Matplotlib Syntax
Published in
2 min read · Oct 27, 2017
Built-in function
variables
#tuple
num1, num2, num3 = nums
nums = (num1, num2, num3)
#declare global variable in the function
global variable_name
enumerate()
#one line command extract index & values in for-loop
for counter, value in enumerate(list):
    print(counter, value)
enumerate(list, 1) #set the first index to 1
loop over list
#list of lists
for key, index in list:
#dictionary
for key, index in dict.items():
#np array
for key in np.nditer(numpy_array):
#dataframe
for key, index in dataframe.iterrows(): #key is the row label; all the data of the row is stored in the index variable
Numpy
Operational Operators
#Python's logical operators (and, or, not) don't work element-wise on NumPy arrays; use these functions instead
np.logical_and(condition1, condition2)
np.logical_or(condition1, condition2)
np.logical_not(condition1) #takes a single condition
Random number
#sets the random seed
np.random.seed(#)
#generate a random number
np.random.rand()
np.random.randint(1, 7) #int from 1 to 6
Matrix
#transpose
np.transpose(numpy_array)
#generate a random number
np.random.rand()
np.random.randint(1, 7) #int from 1 to 6
Pandas
DataFrame
pd.DataFrame(dict)
#change the labels of rows
dataframe.index = row_labels
#select the column
#the single bracket version gives a Pandas Series
#the double bracket version gives a Pandas DataFrame
dataframe['column']
dataframe[['column']]
#slice of DataFrame rows by integer position: >=x, <y
dataframe[x:y]
#select particular value
dataframe.loc['key']
dataframe.iloc[index]
dataframe.iloc[[1,2],[3,4]] #2nd & 3rd rows + 4th & 5th columns (positions are 0-based)
#update one column with another one
dataframe["column"] = dataframe["column2"].apply(str)
dataframe["column"] = dataframe["column2"].apply(str.upper)
dataframe["column"] = dataframe["column2"].apply(len)
Read external file
#csv
pd.read_csv("filename.csv")
arguments:
#the first column is used as row labels
index_col = 0
Matplotlib
Canvas
#show the chart
plt.show()
#clean the canvas
plt.clf()
General Settings
#convert into the log unit
plt.xscale('log')
#chart title
plt.title(xxx)
#axis labels
plt.xlabel(xxx)
#axis ticks
plt.xticks(original_ticks, updated_ticks)
Histogram
plt.hist(data)
arguments:
#define # of bins
bins = #
Scatter plot
plt.scatter(x_data, y_data)
arguments:
#marker size; pass an array to show a 3rd variable
s = z_data