AWK: a powerful tool for data extraction

Vince
vswe
Published in
3 min read · May 18, 2020

AWK is a very powerful tool for data parsing and extraction. It is built into most Unix-like operating systems.

Built-in Variables:

$0: entire record
$1: first field
$2: second field
FILENAME: input file name
length($1): len of first field
length: len of record, including separator.
NR: number of records read so far (the current record number)
NF: number of fields in the current record
RS: input record (row) separator, default "\n"
FS: input field (column) separator, default is whitespace
ORS: output record separator
OFS: output field separator

Syntax:

# inline style
$ awk '{print $0}' inputFile
# use pipe to pass stdout
$ ps | awk '{print $0}'
# with different separator
$ echo "1|2|3" | awk -F '|' '{print $1}'

Run awk script

$ awk -f script.awk inputFile

Script Pattern

#!/usr/bin/awk -f
BEGIN {
# preprocessing
}
{
# main process
}
END {
# post-processing
}

Count lines (like wc -l)

$ awk 'END {print NR}' input

Count characters (including field separators, excluding the newline that ends each record)

awk '{c += length} END {print "Sum:", c}' input 

Count word frequency

INPUT
a b c
b c d
c d e
f
g
OUTPUT
./script input | sort
1 a
1 e
1 f
1 g
2 b
2 d
3 c

script

#!/usr/bin/awk -f
{
for (i=1; i<=NF; i++)
words[tolower($i)]++
}
END {
for (i in words)
print words[i], i
}

Count word frequency with multiple separators

INPUT
a b c
b,c,d
c,d,e
f,
g,
OUTPUT
./script input | sort
1 a
1 e
1 f
1 g
2 b
2 d
3 c

script

#!/usr/bin/awk -f
BEGIN {
FS = "[, ]"
}
{
for (i=1; i<=NF; i++)
if (length($i)) # skip empty fields, e.g. the one after the trailing comma in "f,"
words[tolower($i)]++
}
END {
for (i in words)
print words[i], i
}

Reshape array

INPUT
1
2
3
4
5
6
7
8
9
OUTPUT
1 2 3
4 5 6
7 8 9

script (use printf to choose between OFS and ORS after each value)

#!/usr/bin/awk -f
{
for (i = 1; i <= NF; i++)
printf "%s", $i (++count % 3 ? OFS : ORS)
}

Auto next line

INPUT
1
2
3
4
5
6
7
8
9
OUTPUT
./script n=5 input
1,6
2,7
3,8
4,9
5

script

#!/usr/bin/awk -f
BEGIN {
OFS = ","
}
{
for (i = 1; i <= NF; i++)
data[count++] = $i
}
END {
for (i = 0; i < n; i++)
for (j = i; j < count; j += n)
printf "%s", data[j] (j + n < count? OFS : ORS)
}

Filter Lines

Input
123
456
789
Error: aaaaa
5
Output
123
456
789
5

script

#!/usr/bin/awk -f
{
if (!($0 ~ "Error"))
print($0)
}

Inline 語法

awk '/pattern/ { actions }'
# BEGIN 和 END 就只做一次;中間會基於符合 pattern 的每RS行都做一次
awk 'BEGIN { actions }
/pattern/ { actions }
/pattern/ { actions }
{ actions }
END { actions }'

範例

# 把 I 開頭的第一個 word 都印出來
awk '/^I/ {print $1}' content.txt

I123
Iphone
# 把包含 avg= 的行都印出來
awk '{if ($0 ~ "avg=") print $0}' content.txt
# 印出 match 的最後一行,第一個括弧每行都做,END最後才做
awk '{if ($0 ~ "avg=") data[count++] = $0} END {print data[count-1]}' content.txt
# 改 FS 直接用 FS 找出第幾個
echo "abc=123" | awk 'BEGIN {FS="abc="} {if (NF > 1) print $2}'
123

--

--