Scraping your webpage

Using Node.js to scrape your GitHub Pages webpage

Sparisoma Viridi
5 min readSep 27, 2023

It is assumed that you have installed Node.js on your computer and have already made an application using it, e.g. a very simple web server. Steps from a tutorial (Radavicius, 2022) are given here after simplification and reduction, with additional guidance.

Setup Node.js project

Create a folder for the project, which is projects/scrape in this case and navigate to it.

Initiate the project by typing npm init -y that will create package.json file.

D:\web2scrap\projects\scrape>npm init -y
Wrote to D:\web2scrap\projects\scrape\package.json:

{
"name": "scrape",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC"
}

This file will contain information about the packages that are installed in the folder.

Now the folder is ready.

Install Node.js packages

First install axios.

D:\web2scrap\projects\scrape>npm install axios

added 9 packages, and audited 10 packages in 6s

1 package is looking for funding
run `npm fund` for details

found 0 vulnerabilities
npm notice
npm notice New major version of npm available! 9.6.7 -> 10.1.0
npm notice Changelog: https://github.com/npm/cli/releases/tag/v10.1.0
npm notice Run npm install -g npm@10.1.0 to update!
npm notice

Install cheerio.

D:\web2scrap\projects\scrape>npm install cheerio

added 14 packages, and audited 24 packages in 2s

14 packages are looking for funding
run `npm fund` for details

found 0 vulnerabilities

Install json2csv.

D:\web2scrap\projects\scrape>npm install json2csv

added 4 packages, and audited 28 packages in 4s

14 packages are looking for funding
run `npm fund` for details

found 0 vulnerabilities

Now display the content of package.json file.

D:\web2scrap\projects\scrape>type package.json
{
"name": "scrape",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"axios": "^1.5.1",
"cheerio": "^1.0.0-rc.12",
"json2csv": "^6.0.0-alpha.2"
}
}

You can see that it has been updated every time npm install package-name is executed.

Prepare a webpage

Actually you can use any webpage, but since their structure might be unknown, it would be better to use our own webpage that we have designed.

It is available at https://dudung.github.io/web2scrap/library.html with the following content.


<html>
<head>
<title>library</title>
<style>
/* some style definitions */
</style>
</head>
<body>
<div id="0" class="book">
<div class="title">Practical Node.js: Building Real-World Scalable Web Apps</div>
<div class="author">Mardan, Azat</div>
<div class="publisher">Apress</div>
<div class="edition">2</div>
<div class="year">2018</div>
<div class="isbn">9781484230398</div>
<div class="link">https://isbnsearch.org/isbn/9781484230398</div>
</div>

<!--
Other books
-->

<div id="9" class="book">
<div class="title">Node.js Web Development: Create real-time server-side applications with this practical, step-by-step guide, 3rd Edition</div>
<div class="author">Herron, David</div>
<div class="publisher">Packt Publishing</div>
<div class="edition">3</div>
<div class="year"></div>
<div class="isbn">9781785885419</div>
<div class="link">https://isbnsearch.org/isbn/9781785885419</div>
</div>
</body>
</html>

Using HTML tag as selector

According to the content of the target webpage, elements with the div tag should be found, as in the following lines of code.

// div.js — scrape the target page using the bare <div> tag as selector.
// This is deliberately too coarse: it matches both the book containers
// and every child field, so most entries come back empty (see output).
const cheerio = require("cheerio");
const axios = require("axios");

const url = "https://dudung.github.io/web2scrap/library.html";
const div_data = [];

// Fetch the page, parse it with cheerio, and push the .title text of
// every <div> (empty string when a div has no .title descendant).
async function getDiv() {
  try {
    const response = await axios.get(url);
    const $ = cheerio.load(response.data);

    const div = $("div");
    div.each(function () {
      // `const` fixes the original implicit global `subdiv`.
      const subdiv = $(this).find(".title").text();

      div_data.push({ subdiv });
    });

    console.log(div_data);
  } catch (error) {
    // Network or parse failure: report and continue (best-effort script).
    console.log(error);
  }
}

getDiv();

Then it is executed as follows.

D:\web2scrap\projects\scrape>node div.js
[
{
subdiv: 'Practical Node.js: Building Real-World Scalable Web Apps'
},
{ subdiv: '' },
{ subdiv: '' },
{ subdiv: '' },
{ subdiv: '' },
{ subdiv: '' },
{ subdiv: '' },
{ subdiv: '' },

..

{
subdiv: 'Node.js Web Development: Create real-time server-side applications with this practical, step-by-step guide, 3rd Edition'
},
{ subdiv: '' },
{ subdiv: '' },
{ subdiv: '' },
{ subdiv: '' },
{ subdiv: '' },
{ subdiv: '' },
{ subdiv: '' }
]

Since there are too many div elements, it is hard to distinguish a book from its child elements. This way is not recommended.

Using style as selector

Noticing the content of the target webpage, we would like to have the elements with classes title, author, and year. The following lines of code are written.

// books.js — scrape the target page using the .book class as selector,
// so each match is one book container and its fields can be read with
// nested .find() calls.
const cheerio = require("cheerio");
const axios = require("axios");

const url = "https://dudung.github.io/web2scrap/library.html";
const book_data = [];

// Fetch the page, parse it with cheerio, and collect {title, author,
// year} for every element with class "book".
async function getBooks() {
  try {
    const response = await axios.get(url);
    const $ = cheerio.load(response.data);

    const books = $(".book");
    books.each(function () {
      // `const` fixes the original implicit globals title/author/year.
      const title = $(this).find(".title").text();
      const author = $(this).find(".author").text();
      const year = $(this).find(".year").text();

      book_data.push({ title, author, year });
    });

    console.log(book_data);
  } catch (error) {
    // Network or parse failure: report and continue (best-effort script).
    console.log(error);
  }
}

getBooks();

Execute it with

D:\web2scrap\projects\scrape>node books.js

and get the result

[
{
title: 'Practical Node.js: Building Real-World Scalable Web Apps',
author: 'Mardan, Azat',
year: '2018'
},
{
title: 'Node.js for PHP Developers: Porting PHP to Node.js',
author: 'Howard, Daniel',
year: '2013'
},

..

{
title: 'Supercharging Node.js Applications with Sequelize: Create high-quality Node.js apps effortlessly while interacting with your SQL database',
author: 'Durante, Daniel',
year: '2022'
},
{
title: 'Node.js Web Development: Create real-time server-side applications with this practical, step-by-step guide, 3rd Edition',
author: 'Herron, David',
year: '2016'
}
]

Write results to file

Next is to save the result to a file, a CSV file. The previous program is modified as follows and renamed books_write2csv.js.

// books_write2csv.js — same scrape as books.js, but the collected
// records are converted to CSV with json2csv and written to disk
// instead of being printed.
const cheerio = require("cheerio");
const axios = require("axios");
const j2cp = require("json2csv").Parser;
const fs = require("fs");

const url = "https://dudung.github.io/web2scrap/library.html";
const book_data = [];

// Fetch the page, collect {title, author, year} for every .book
// element, then serialize the array to ./book_info.csv.
async function getBooks() {
  try {
    const response = await axios.get(url);
    const $ = cheerio.load(response.data);

    const books = $(".book");
    books.each(function () {
      // `const` fixes the original implicit globals title/author/year.
      const title = $(this).find(".title").text();
      const author = $(this).find(".author").text();
      const year = $(this).find(".year").text();

      book_data.push({ title, author, year });
    });

    // json2csv infers the header row from the object keys.
    const parser = new j2cp();
    const csv = parser.parse(book_data);
    fs.writeFileSync("./book_info.csv", csv);
  } catch (error) {
    // Network, parse, or filesystem failure: report and continue.
    console.log(error);
  }
}

getBooks();

Execute it

D:\web2scrap\projects\scrape>node books_write2csv.js

and it will produce the book_info.csv file.

Display its content.

D:\web2scrap\projects\scrape>type book_info.csv
"title","author","year"
"Practical Node.js: Building Real-World Scalable Web Apps","Mardan, Azat","2018"
"Node.js for PHP Developers: Porting PHP to Node.js","Howard, Daniel","2013"
"Full Stack JavaScript: Learn Backbone.js, Node.js and MongoDB","Mardan, Azat","2015"
"Get Programming with Node.js","Wexler, Jonathan","2019"
"Building APIs with Node.js","Pereira, Caio Ribeiro","2016"
"Essential Node.js Security","Tal, Liran","2018"
"Node Cookbook: Discover solutions, techniques, and best practices for server-side web development with Node.js 14","Griggs, Bethany","2020"
"Building Scalable Apps with Redis and Node.js","Johanan, Joshua","2014"
"Supercharging Node.js Applications with Sequelize: Create high-quality Node.js apps effortlessly while interacting with your SQL database","Durante, Daniel","2022"
"Node.js Web Development: Create real-time server-side applications with this practical, step-by-step guide, 3rd Edition","Herron, David","2016"
D:\web2scrap\projects\scrape>

Now you have a CSV file containing information about books from target website https://dudung.github.io/web2scrap/library.html.

--

--