]>
Commit | Line | Data |
---|---|---|
f67539c2 TL |
1 | #!/usr/bin/awk -f |
2 | ||
3 | # | |
4 | # Version 1 | |
5 | # | |
6 | # This awk script takes two similarly sorted lists and outputs | |
7 | # only the lines which exist in both lists. The script takes | |
8 | # three inputs: | |
9 | # | |
10 | # ./rgw-gap-list-comparator \ | |
11 | # -v filetwo=gap-list-B.txt \ | |
12 | # -v matchout=matched_lines.txt \ | |
13 | # gap-list-A.txt | |
14 | # | |
15 | ||
# usage()
#
# Print the operator instructions for this tool to stderr and terminate
# the script with exit status 1.  Takes no arguments and never returns.
#
# The file names in step 2 and step 3 must agree: step 3 reads the
# "gap-list-B.sorted.gap" file that step 2 tells the user to create.
function usage() {
  print "">>"/dev/stderr"
  print "">>"/dev/stderr"
  print "The idea behind the script is to eliminate false positive hits">>"/dev/stderr"
  print "from the rgw-gap-list tool which are due to upload timing of new">>"/dev/stderr"
  print "objects during the tool's execution. To use the tool properly,">>"/dev/stderr"
  print "the following process should be followed:">>"/dev/stderr"
  print "">>"/dev/stderr"
  print "">>"/dev/stderr"
  print " 1: Run the 'rgw-gap-list' tool twice">>"/dev/stderr"
  print "">>"/dev/stderr"
  print " 2: Sort the resulting map files:">>"/dev/stderr"
  print " $ export LC_ALL=C">>"/dev/stderr"
  print " $ sort gap-list-A.gap > gap-list-A.sorted.gap">>"/dev/stderr"
  print " $ sort gap-list-B.gap > gap-list-B.sorted.gap">>"/dev/stderr"
  print " -- Where the A / B in the gap-list file names are the date/time associated with each of the respective 'rgw-gap-list' outputs">>"/dev/stderr"
  print "">>"/dev/stderr"
  print " 3: Run the 'rgw-gap-list-comparator' script over the two files:">>"/dev/stderr"
  print " $ rm matched_lines.txt">>"/dev/stderr"
  print " $ ./rgw-gap-list-comparator -v filetwo=gap-list-B.sorted.gap -v matchout=matched_lines.txt gap-list-A.sorted.gap">>"/dev/stderr"
  print " -- Where the A / B in the gap-list file names are the date/time associated with each of the respective 'rgw-gap-list' outputs">>"/dev/stderr"
  print "">>"/dev/stderr"
  print " The resulting 'matched_lines.txt' will be a high confidence list of impacted objects with little to no false positives.">>"/dev/stderr"
  print "">>"/dev/stderr"
  print "">>"/dev/stderr"
  exit 1
}
43 | ||
# advance_f2()
#
# Read the next line of the second list (the file named by the global
# `filetwo`) into the global `f2line`.
#
#   success     : f2_count is incremented.
#   end of file : f2_eof is set to 1.
#   read error  : getline returns a negative value (for example when the
#                 file cannot be opened).  This is reported on stderr and
#                 the script exits with status 1 instead of being treated
#                 as a normal end of file, which would silently produce an
#                 empty result.
#
# `rc` is a local scratch variable (awk's extra-parameter idiom); callers
# pass no arguments.
function advance_f2(    rc) {
  rc = (getline f2line < filetwo)
  if (rc > 0) {
    f2_count++
    return
  }

  # Nothing more can be read from the second list in either remaining case.
  f2_eof = 1
  if (rc < 0) {
    print "Error: unable to read file '" filetwo "'" >> "/dev/stderr"
    exit 1
  }
}
51 | ||
# test_lines()
#
# Compare the current record of the first list ($0) with the current line
# of the second list (f2line).
#
# Returns:
#   0  the two lines are identical; the line has been appended to the file
#      named by `matchout`, lineoutcount has been incremented and the
#      second list has been advanced by one line
#   1  $0 sorts before f2line
#   2  $0 sorts after f2line
#
# Both operands are concatenated with "" so the comparison is always a
# string comparison.  Input records and getline results that look like
# numbers would otherwise be compared numerically, which disagrees with
# the byte-wise order of the sorted input lists (and would treat lines
# such as "10" and "10.0" as equal).
#
# `cur` and `other` are local scratch variables; callers pass no arguments.
function test_lines(    cur, other) {
  cur = $0 ""
  other = f2line ""

  if (cur == other) {
    print $0 >> matchout
    lineoutcount++
    advance_f2()
    return 0
  }

  return (cur > other) ? 2 : 1
}
64 | ||
# status_out()
#
# Write one progress line to stderr: the current date/time followed by the
# number of lines read from each list and the number of matches written.
function status_out() {
  printf "%s % 17d\t% 17d\t% 12d\n",
         get_date_time(),
         f1_count,
         f2_count,
         lineoutcount >> "/dev/stderr"
}
68 | ||
# get_date_time()
#
# Run the external `date` command and return its output line
# (format "%F %T").  Returns the empty string if no line could be read
# from the command.
#
# `cmd` and `stamp` are local scratch variables (awk's extra-parameter
# idiom), so the function neither leaks names into the global namespace
# nor returns a stale timestamp from an earlier call when `date` fails.
# Callers pass no arguments.
function get_date_time(    cmd, stamp) {
  cmd = "date +%F\\ %T"
  stamp = ""
  cmd | getline stamp
  # Close the pipe so the command is run again on the next call.
  close(cmd)
  return stamp
}
75 | ||
# Start-up: validate the -v parameters, initialise the counters, read the
# first line of the second list and print the progress header.
BEGIN {
  # Both -v filetwo=... and -v matchout=... are required.  All diagnostics,
  # including the "Missing parameter." line, go to stderr.
  if (filetwo == "" || matchout == "") {
    print "" >> "/dev/stderr"
    print "" >> "/dev/stderr"
    print "Missing parameter." >> "/dev/stderr"
    print "" >> "/dev/stderr"
    print "" >> "/dev/stderr"
    usage()
  }

  f1_count = 0          # lines read from the first list (main input)
  f2_count = 0          # lines read from the second list (filetwo)
  lineoutcount = 0      # matching lines written to matchout
  f2_eof = 0            # set to 1 once the second list is exhausted
  statusevery = 100000  # print a progress line every this many input lines

  # Prime f2line with the first line of the second list.
  advance_f2()

  printf("%s File 1 Line Count\tFile 2 Line Count\tPotentially Impacted Objects\n", get_date_time()) >> "/dev/stderr"
  status_out()
}
95 | ||
96 | ||
# Main rule, run once per line of the first list.
{
  f1_count++

  # Once the second list is exhausted no further matches are possible.
  if (f2_eof != 0) {
    exit 0
  }

  # test_lines() returns 2 while the current record sorts after f2line;
  # keep pulling lines from the second list until it reports a match (0),
  # reports that the record sorts before f2line (1), or the second list
  # runs out.
  while (f2_eof == 0 && test_lines() == 2) {
    advance_f2()
  }

  # Periodic progress report.
  if ((f1_count % statusevery) == 0) {
    status_out()
  }
}
113 | ||
# Emit a final progress line, provided at least one record of the first
# list was read.
END {
  if (f1_count > 0)
    status_out()
}
119 |