-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess_tweets.py
More file actions
114 lines (87 loc) · 2.81 KB
/
process_tweets.py
File metadata and controls
114 lines (87 loc) · 2.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from string import ascii_letters
def clean_line(line):
    """Normalise a line of text so it can be split into words.

    ASCII letters are kept and converted to lower case. Spaces, ``@`` and
    ``#`` are kept unchanged. Every other character (digits, punctuation,
    tabs, newlines, non-ASCII characters, ...) is removed.

    Args:
        line: The raw text to clean.

    Returns:
        A new string containing only the kept characters, in their
        original order.
    """
    kept_symbols = " @#"
    # One pass that filters and lower-cases at the same time; this avoids
    # deleting from the middle of a list, which is quadratic on long input.
    # Lower-casing a space, "@" or "#" leaves it unchanged.
    return "".join(
        char.lower()
        for char in line
        if char in ascii_letters or char in kept_symbols
    )
def get_tweet_text(line):
    """Return the second comma-separated field of *line*.

    Only the text between the first and second comma is returned; anything
    after a second comma is not included.
    NOTE(review): presumably the input rows look like ``id,text,...`` --
    confirm against the real tweet file.

    Args:
        line: One raw line from the tweet file.

    Returns:
        The second field, or an empty string when the line contains no
        comma (for example a blank line), so such lines contribute no
        words instead of raising ``IndexError``.
    """
    fields = line.split(",")
    if len(fields) < 2:
        return ""
    return fields[1]
def read_stopwords(file_name="stopwords.txt"):
    """Read a stop-word list that stores one entry per line.

    Args:
        file_name: Path of the file to read. Defaults to ``"stopwords.txt"``
            (resolved against the current working directory).

    Returns:
        A list with one string per line of the file, in file order, with
        the newline characters removed. Blank lines yield empty strings and
        other whitespace is left untouched.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the file even if reading fails part-way.
    with open(file_name, "r") as stopword_file:
        return [entry.replace("\n", "") for entry in stopword_file]
def process_tweet_text(text, stopwords=None):
    """Clean a tweet's text and return its words without the stop words.

    The text is passed through ``clean_line`` and split on whitespace; every
    word that appears in the stop-word collection is then dropped.

    Args:
        text: The raw tweet text.
        stopwords: Optional iterable of words to drop. When omitted, the
            list is loaded with ``read_stopwords()`` on each call; pass a
            preloaded collection to avoid re-reading the file.

    Returns:
        A list of the remaining words, in their original order.
    """
    if stopwords is None:
        stopwords = read_stopwords()
    # Set membership drops each matching word exactly once, even when the
    # stop-word list contains the same entry more than once.
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = set(stopwords)
    return [word for word in clean_line(text).split() if word not in stopwords]


def process_tweet_file(file_name):
    """Count how often each non-stop word occurs in a tweet file.

    Each line of the file is reduced to its tweet text with
    ``get_tweet_text`` and to a word list with ``process_tweet_text``.

    Args:
        file_name: Path of the UTF-8 encoded tweet file, one tweet per line.

    Returns:
        A dict mapping each word to the number of times it was seen.

    Raises:
        OSError: If the tweet file or the stop-word file cannot be opened.
    """
    word_freqs = {}
    with open(file_name, encoding='utf-8') as tweets:
        # Load the stop words once for the whole file rather than once per
        # tweet.
        stopwords = frozenset(read_stopwords())
        for line in tweets:
            text = get_tweet_text(line)
            for word in process_tweet_text(text, stopwords):
                word_freqs[word] = word_freqs.get(word, 0) + 1
    return word_freqs
def print_statistics(word_freqs):
    """Print summary statistics for a word-frequency table.

    Four lines are written to standard output: the total number of words
    (the sum of all frequencies), the number of different words, the most
    frequent word, and that word's frequency.

    Args:
        word_freqs: Mapping of word -> number of occurrences.

    When several words share the highest frequency, the first one in the
    mapping's iteration order is reported. For an empty mapping the most
    frequent word is reported as ``nope`` with a frequency of 0.
    """
    # Total occurrences, as opposed to len(word_freqs), which is the number
    # of distinct words.
    total_words = sum(word_freqs.values())

    top_word = "nope"
    top_freq = 0
    for word, freq in word_freqs.items():
        # Strict comparison keeps the earliest word on ties.
        if freq > top_freq:
            top_word = word
            top_freq = freq

    print('The total number of words is:', str(total_words))
    print('The total number of different words is:', str(len(word_freqs)))
    print('The most frequent word is:', top_word)
    # Use the tracked frequency so an empty table does not raise KeyError.
    print('With a frequency of:', str(top_freq))
def write_words(word_freqs, file_name):
    """Write a word-frequency table to a UTF-8 text file.

    Each entry is written on its own line as ``<word> <frequency>``, in the
    mapping's iteration order. An existing file is overwritten.

    Args:
        word_freqs: Mapping of word -> number of occurrences.
        file_name: Path of the file to write.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # Write to the path the caller asked for, and let the context manager
    # close the file even if a write fails.
    with open(file_name, "w", encoding="utf-8") as out_file:
        out_file.writelines(
            "{} {}\n".format(word, count) for word, count in word_freqs.items()
        )
# Script body: these statements are at module level, so they run whenever
# this file is executed or imported.
# Build the word -> frequency table from 'tweets.txt'.
wf = process_tweet_file('tweets.txt')
# Print the summary statistics for that table to standard output.
print_statistics(wf)
# Save the table, passing 'words.txt' as the output file name.
write_words(wf, 'words.txt')