Python-Tweet-Cloud/process_tweets.py at master · apetezible/Python-Tweet-Cloud · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114

from string import ascii_letters


def clean_line(line):
    """Normalise a piece of text so it can be split into countable words.

    ASCII letters are kept and converted to lower case. Spaces, ``@`` and
    ``#`` are kept unchanged. Every other character (digits, punctuation,
    newlines, tabs, non-ASCII letters, ...) is removed.

    Returns the filtered text as a new string.
    """
    # Non-letter characters that survive the filter untouched.
    preserved = (" ", "@", "#")
    return "".join(
        char.lower() if char in ascii_letters else char
        for char in line
        if char in ascii_letters or char in preserved
    )


def get_tweet_text(line):
    """Return the second comma-separated field of *line*.

    The field ends at the next comma, so anything after a second comma is
    not part of the returned value.

    Raises IndexError when *line* contains no comma at all.
    """
    # Only the first two separators matter for locating field number two.
    return line.split(",", 2)[1]


def read_stopwords():
    """Load the stop-word list from ``stopwords.txt``.

    The file is opened relative to the current working directory and is
    expected to hold one entry per line.

    Returns a list with one string per line of the file, in file order,
    with the newline characters removed. Blank lines yield empty strings.

    Raises OSError (e.g. FileNotFoundError) if the file cannot be opened.
    """
    # The context manager guarantees the handle is closed even if reading
    # fails part-way through.
    with open("stopwords.txt", "r") as stopword_file:
        return [entry.replace("\n", "") for entry in stopword_file]


def process_tweet_text(text, stopwords=None):
    """Split tweet text into cleaned words and drop the stop words.

    Args:
        text: Raw tweet text. It is normalised with ``clean_line`` and then
            split on whitespace.
        stopwords: Optional iterable of words to discard. When omitted, the
            list is loaded with ``read_stopwords()``; callers that process
            many tweets can pass it in to avoid re-reading the file.

    Returns:
        A list of the remaining words, in their original order.
    """
    if stopwords is None:
        stopwords = read_stopwords()
    # A set gives O(1) membership tests and also makes repeated entries in
    # the stop-word list harmless: each word is kept or dropped exactly once.
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = set(stopwords)
    return [word for word in clean_line(text).split() if word not in stopwords]


def process_tweet_file(file_name):
    """Count how often each non-stop word occurs in a file of tweets.

    Each line of the UTF-8 encoded file is passed through
    ``get_tweet_text`` and ``process_tweet_text``.

    Args:
        file_name: Path of the tweet file to read.

    Returns:
        A dict mapping each word to its number of occurrences.
    """
    # Load the stop words once instead of once per tweet.
    stopwords = frozenset(read_stopwords())
    word_freqs = {}
    with open(file_name, encoding='utf-8') as tweets:
        for line in tweets:
            text = get_tweet_text(line)
            for word in process_tweet_text(text, stopwords):
                word_freqs[word] = word_freqs.get(word, 0) + 1
    return word_freqs


def print_statistics(word_freqs):
    """Print summary statistics for a word-frequency mapping.

    Four lines are written to standard output: the total number of word
    occurrences, the number of distinct words, the most frequent word and
    that word's frequency.

    Args:
        word_freqs: Mapping of word -> occurrence count.

    When several words share the highest count, the first one in the
    mapping's iteration order is reported. For an empty mapping the word is
    reported as ``nope`` with a frequency of 0.
    """
    # Total occurrences is the sum of the counts, not the number of keys
    # (the latter is the distinct-word figure printed separately below).
    total_words = sum(word_freqs.values())

    most_frequent = "nope"
    highest_freq = 0
    for word, freq in word_freqs.items():
        # Strict comparison keeps the first word seen on ties.
        if freq > highest_freq:
            most_frequent = word
            highest_freq = freq

    print('The total number of words is:', total_words)
    print('The total number of different words is:', len(word_freqs))
    print('The most frequent word is:', most_frequent)
    # Use the tracked count rather than a second lookup so that an empty
    # mapping does not raise KeyError on the placeholder word.
    print('With a frequency of:', highest_freq)


def write_words(word_freqs, file_name):
    """Write the word frequencies to a text file.

    One line is written per word, formatted as ``<word> <count>``, in the
    mapping's iteration order. The file is created or overwritten and is
    encoded as UTF-8.

    Args:
        word_freqs: Mapping of word -> occurrence count.
        file_name: Path of the file to write.
    """
    # Honour the caller-supplied path, and let the context manager close the
    # file even if a write fails.
    with open(file_name, "w", encoding="utf-8") as output:
        output.writelines(
            "{} {}\n".format(word, freq) for word, freq in word_freqs.items()
        )

# Script body: count the words in tweets.txt, print a summary to standard
# output, and save the per-word frequencies to words.txt.
# NOTE(review): there is no __main__ guard, so these statements also run
# whenever this module is imported.
wf = process_tweet_file('tweets.txt')
print_statistics(wf)
write_words(wf, 'words.txt')