Python common syntax for data mining

Built-in / NumPy / Pandas / Matplotlib Syntax

--

Built-in function

variables

#tuple
num1, num2, num3 = nums
nums = (num1, num2, num3)
#declare global variable in the function
global variable_name

enumerate()

#get both the index and the value in a single for-loop statement
for counter, value in enumerate(list):
    print(counter, value)
enumerate(list, 1) #set the first index to 1

loop over list

#list of lists
for key, index in list:
#dictionary
for key, index in dict.items():
#np array
for key in np.nditer(numpy_array):
#dataframe
for key, index in dataframe.iterrows(): #key is the row label; the row's data (a Series) is stored in the index variable

Numpy

Operational Operators

#the and / or / not keywords don't work element-wise on NumPy arrays; use these functions instead
np.logical_and(condition1, condition2)
np.logical_or(condition1, condition2)
np.logical_not(condition1) #takes a single condition

Random number

#sets the random seed
np.random.seed(#)
#generate a random number
np.random.rand()
np.random.randint(1, 7) #int from 1 to 6

Matrix

#transpose
np.transpose(numpy_array)
#generate a random number
np.random.rand()
np.random.randint(1, 7) #int from 1 to 6

Pandas

DataFrame

pd.DataFrame(dict)
#change the labels of rows
dataframe.index = row_labels
#select the column
#the single bracket version gives a Pandas Series
#the double bracket version gives a Pandas DataFrame
dataframe['column']
dataframe[['column']]
#slice of DataFrame rows: 0-based positions x (inclusive) to y (exclusive)
dataframe[x:y]
#select particular value
dataframe.loc['key']
dataframe.iloc[index]
dataframe.iloc[[1,2],[3,4]] #rows 2,3 + columns 4,5 (0-based positions 1,2 and 3,4)
#update one column with another one
dataframe["column"] = dataframe["column2"].apply(str)
dataframe["column"] = dataframe["column2"].apply(str.upper)
dataframe["column"] = dataframe["column2"].apply(len)

Read external file

#csv
pd.read_csv("filename.csv")
arguments:
#the first column is used as row labels
index_col = 0

Matplotlib

Canvas

#show the chart
plt.show()
#clean the canvas
plt.clf()

General Settings

#use a logarithmic scale on the x-axis
plt.xscale('log')
#chart title
plt.title(xxx)
#axis labels
plt.xlabel(xxx)
#axis ticks
plt.xticks(original_ticks, updated_ticks)

Histogram

plt.hist(data)
arguments:
#define # of bins
bins = #

Scatter plot

plt.scatter(x_data, y_data)
arguments:
#set the marker size (can be used to encode a 3rd variable)
s = z_data

--

--