-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhtml_test.pl
More file actions
122 lines (93 loc) · 3.46 KB
/
Copy pathhtml_test.pl
File metadata and controls
122 lines (93 loc) · 3.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/perl -w
use strict;
use HTML::TreeBuilder 3; # make sure our version isn't ancient
use LWP::Simple;
# Gets the text content of a post.
#   $_[0] - element (HTML::Element-style object) wrapping one post.
# Looks up every descendant found by find_by_attribute("class",
# "js-tweet-text tweet-text") and returns their as_text() values glued
# together with no separator. Returns "" when nothing matches.
sub getText {
    my ($post) = @_;
    my @textNodes = $post->find_by_attribute("class", "js-tweet-text tweet-text");
    # scalar() keeps as_text() in scalar context, as a plain concatenation would.
    return join("", map { scalar $_->as_text() } @textNodes);
}
# Gets the poster user name of a block.
#   $_[0] - element (HTML::Element-style object) wrapping one post.
# Walks the elements returned by find_by_tag_name("s","b") in order. Once an
# element whose text is exactly "@" has been seen, the text of the next
# element that is not itself "@" is returned as the user name.
# Returns "" when no such element exists (no "@" marker at all, or nothing
# following it) instead of running off the end of the list and dying on an
# undefined element.
sub getPosterUserName {
    my ($post) = @_;
    #Gets the tags within the html that handle twitter links.
    my @twitterLinks = $post->find_by_tag_name("s", "b");
    my $isSearch = 0;
    foreach my $link (@twitterLinks) {
        my $string = $link->as_text();
        if ($string eq "@") {
            $isSearch = 1;
        }
        elsif ($isSearch) {
            # First piece of text after the first "@" marker: the poster.
            return $string;
        }
    }
    return "";
}
# Gets the user names a post is addressed to.
#   $_[0] - element (HTML::Element-style object) wrapping one post.
# Walks the elements returned by find_by_tag_name("s","b") in order and counts
# the ones whose text is exactly "@". The first "@" belongs to the poster and
# is ignored; from the second "@" onwards, the first non-"@" text following a
# marker is collected. Returns the collected names, each followed by a single
# space, or "" when there are none.
sub getAtMessages {
    my ($post) = @_;
    my @mentioned;
    my $markersSeen  = 0;   # how many "@" elements have been met so far
    my $awaitingName = 0;   # true while an "@" has not been paired with a name
    for my $link ($post->find_by_tag_name("s", "b")) {
        my $text = $link->as_text();
        if ($text eq "@") {
            $awaitingName = 1;
            $markersSeen++;
            next;
        }
        # Skip ordinary text, and skip the name attached to the first marker.
        next unless $markersSeen >= 2 && $awaitingName;
        push @mentioned, $text;
        $awaitingName = 0;
    }
    return join("", map { $_ . " " } @mentioned);
}
# Gets the time a post was made.
#   $_[0] - element (HTML::Element-style object) wrapping one post.
# Takes the first element found by find_by_attribute("class",
# "tweet-timestamp js-permalink js-nav js-tooltip") and returns the value of
# its "title" attribute.
# Returns "" when the block has no such element or the element has no title,
# rather than dying on an undefined element.
sub getTimePosted {
    my ($post) = @_;
    my @twitterLinks = $post->find_by_attribute("class", "tweet-timestamp js-permalink js-nav js-tooltip");
    return "" unless @twitterLinks;
    my $finalString = $twitterLinks[0]->attr("title");
    return defined $finalString ? $finalString : "";
}
# Gets the list of twitter links of one kind (e.g. hashtags) involved in a post.
#   $_[0] - the marker to search for (either "@" or "#").
#   $_[1] - element (HTML::Element-style object) wrapping one post.
# Walks the elements returned by find_by_tag_name("s","b") in order. Each time
# an element's text equals the marker, the text of the next element that is
# not itself the marker is collected.
# Returns the collected texts, each followed by a single space, or "" when
# there are none.
sub getTwitterLinks {
    my ($search, $post) = @_;
    my @twitterLinks = $post->find_by_tag_name("s", "b");
    my $finalString = "";
    my $isSearch = 0;   # true while a marker is waiting for its text
    foreach my $link (@twitterLinks) {
        my $string = $link->as_text();
        if ($string eq $search) {
            $isSearch = 1;
        }
        elsif ($isSearch) {
            $finalString .= $string . " ";
            $isSearch = 0;
        }
    }
    return $finalString;
}
# Main script: downloads the hashtag page, parses it into an HTML tree and
# writes one pipe-delimited line per element of class "content" to report.txt.
my $webpage = "https://twitter.com/hashtag/ostrich";
my $pageHTML = get($webpage);
# LWP::Simple's get() returns undef when the download fails; stop here rather
# than parsing nothing and writing an empty report.
die "Could not fetch $webpage\n" unless defined $pageHTML;
my $root = HTML::TreeBuilder->new;
$root->parse($pageHTML);
#$root->parse_file("report.txt");
$root->eof( ); # done parsing for this tree
#$root->dump; # print( ) a representation of the tree
#my @array = $root->find_by_attribute("class","js-tweet-text tweet-text");
my @array = $root->find_by_attribute("class","content");
#$array[1]->dump;
# Encode explicitly on output: post text routinely contains non-ASCII
# characters, which would otherwise trigger "Wide character in print".
# NOTE(review): assumes get() hands back decoded character data (current
# libwww-perl behaviour) - confirm against the installed LWP version.
open(my $fh, ">:encoding(UTF-8)", "report.txt")
    or die "Cannot open report.txt for writing: $!\n";
#Titles - one per field printed for each post below.
print $fh "HashTags|Tweet|Tweeters UserName|Tweeted to:|Time Posted\n";
foreach my $post (@array) {
    print $fh getTwitterLinks("#", $post), "|",
              getText($post), "|",
              getPosterUserName($post), "|",
              getAtMessages($post), "|",
              getTimePosted($post), "\n";
}
# Buffered write errors only surface at close, so check it.
close($fh) or die "Cannot close report.txt: $!\n";
$root->delete; # erase this tree because we're done with it