Installation
# Start & enter <instance-2>.

# Log in as root:
sudo su -

# Change to the temp directory:
cd /tmp

# Download the MySQL 8.0 community yum repo package for CentOS 7:
wget https://repo.mysql.com/mysql80-community-release-el7-1.noarch.rpm

# Install the repo package so the MySQL repos are added to the yum
# repo list. NOTE(review): the original had an en-dash ("el7–1") in
# this filename; it must match the downloaded file exactly.
yum localinstall mysql80-community-release-el7-1.noarch.rpm

# Quick check that the MySQL repos are now enabled:
yum repolist enabled | grep "mysql.*"

# Install the latest MySQL community server version, i.e. the
# essentials -- client, server, common, libs:
yum install mysql-community-server
Services Startup
# Start the MySQL daemon and confirm it is running:
service mysqld start
service mysqld status

# Quick version check. NOTE(review): the original used an em-dash
# ("mysql — version"); the flag must be a double hyphen:
mysql --version
Post Installation — very important
# MySQL writes a temporary root password into /var/log/mysqld.log
# during first startup. Change into the log directory to read it:
cd /var/log
Open the log file with cat and save the temporary password somewhere safe — you will need it in the next step.
# Now run the secure installation wizard:
mysql_secure_installation

# Answer the wizard's prompts as follows:
#   Enter password for user root:   <the temporary password from mysqld.log>
#   New password:                   <your new password>
#   Change the password for root?   n   (already changed above)
#   Remove anonymous users?         y
#   Disallow root login remotely?   y
#   Remove test database?           y
#   Reload privilege tables now?    y
Log In
mysql -u root -p
Sample Data
Details on the data are in the footnote of this post
# Create a new folder in <instance-2> and change into it.
# NOTE(review): the original never cd'd into this folder, so the zip
# was downloaded to the current directory while the later LOAD DATA
# statement expects the CSV under /root/flightdatasets/.
mkdir /root/flightdatasets
cd /root/flightdatasets

# Download the cleansed version of the data directly:
wget https://github.com/pratnimk/sqoop-big-data-analysis/raw/master/915662529_T_ONTIME_REPORTING.zip

# Unzip it:
unzip 915662529_T_ONTIME_REPORTING.zip

# Log in to mysql (enter your password at the prompt):
mysql -u root -p

# Allow the MySQL server to accept local data imports, then leave:
SET GLOBAL local_infile=1;
exit

# Re-login with the client-side local-infile option enabled:
mysql --local-infile=1 -u root -p
-- Create and switch to the new database.
-- NOTE(review): the original lines were missing the ";" terminators,
-- so the mysql client would never execute them.
CREATE DATABASE flight_data;
USE flight_data;
-- Flight on-time reporting table. Column names mirror the BTS
-- ON_TIME_REPORTING extract headers so the CSV load maps 1:1.
CREATE TABLE flightrepdata (
    FL_DATE               DATE,
    OP_UNIQUE_CARRIER     VARCHAR(10),
    ORIGIN_AIRPORT_ID     INT,
    ORIGIN_AIRPORT_SEQ_ID INT,
    ORIGIN_CITY_MARKET_ID INT,
    ORIGIN_CITY_NAME      VARCHAR(300),
    DEST_AIRPORT_ID       INT,
    DEST_AIRPORT_SEQ_ID   INT,
    DEST_CITY_MARKET_ID   INT,
    DEST_CITY_NAME        VARCHAR(300),
    -- NOTE(review): times appear to be hhmm values stored as INT, so
    -- leading zeros are lost (e.g. "0035" -> 35) — confirm downstream
    -- tooling expects this.
    DEP_TIME              INT,
    ARR_TIME              INT
);
LOAD DATA LOCAL INFILE ‘/root/flightdatasets/915662529_T_ONTIME_REPORTING.csv’ INTO TABLE flightrepdata FIELDS TERMINATED BY ‘,’ ENCLOSED BY ‘“‘ LINES TERMINATED BY ‘\n’ IGNORE 1 ROWS (FL_DATE, OP_UNIQUE_CARRIER, ORIGIN_AIRPORT_ID, ORIGIN_AIRPORT_SEQ_ID, ORIGIN_CITY_MARKET_ID, ORIGIN_CITY_NAME, DEST_AIRPORT_ID, DEST_AIRPORT_SEQ_ID, DEST_CITY_MARKET_ID, DEST_CITY_NAME, DEP_TIME, ARR_TIME);
Voila, 60k+ records imported
-- Add a surrogate auto-increment primary key as the first column;
-- without a unique key, downstream import/processing (e.g. split-based
-- exports) would be unnecessarily complicated.
ALTER TABLE flightrepdata
    ADD COLUMN uid INT NOT NULL AUTO_INCREMENT PRIMARY KEY FIRST;
The Dataset
Please note, I didn’t download all of the columns to create the dataset. You can though.
The Homepage — https://www.bts.gov/
The data — https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236&DB_Short_Name=On-Time
More details on the fields can be found here. Explanation with some data? Here you go.