working wordcloud and timeline
This commit is contained in:
parent
4ff61e339e
commit
ed7a9eed80
|
|
@ -191,3 +191,6 @@ poetry.toml
|
|||
|
||||
# Custom
|
||||
*.json
|
||||
*.csv
|
||||
*.png
|
||||
!mask.png
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 54 KiB |
|
|
@ -0,0 +1,57 @@
|
|||
a
|
||||
át
|
||||
az
|
||||
be
|
||||
csak
|
||||
de
|
||||
egy
|
||||
el
|
||||
én
|
||||
és
|
||||
fel
|
||||
hát
|
||||
hogy
|
||||
ide
|
||||
igen
|
||||
ki
|
||||
le
|
||||
lesz
|
||||
meg
|
||||
mi
|
||||
mint
|
||||
nem
|
||||
ő
|
||||
oda
|
||||
ők
|
||||
ön
|
||||
össze
|
||||
rá
|
||||
szét
|
||||
te
|
||||
ti
|
||||
vagy
|
||||
van
|
||||
vissza
|
||||
volt
|
||||
h
|
||||
is
|
||||
ha
|
||||
ez
|
||||
mert
|
||||
már
|
||||
akkor
|
||||
még
|
||||
most
|
||||
jó
|
||||
azt
|
||||
:
|
||||
.
|
||||
?
|
||||
!
|
||||
,
|
||||
;
|
||||
(
|
||||
)
|
||||
d
|
||||
:-)
|
||||
:d
|
||||
|
|
@ -0,0 +1,52 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import json
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from participant import Participant
|
||||
import ftfy
|
||||
import analyzing
|
||||
|
||||
|
||||
def read_json(filename):
    """Load and return the parsed JSON content of *filename*.

    Messenger exports are UTF-8; pin the encoding so the read does not
    depend on the platform default (e.g. cp1252 on Windows).
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)
|
||||
|
||||
|
||||
def main():
    """Read the Messenger JSON export(s) given on the command line, collect
    each participant's messages, and generate the analysis artefacts."""
    file_list = sys.argv[1:]
    if not file_list:
        # Original code raised IndexError here; fail with a usage hint instead.
        sys.exit("usage: main.py export1.json [export2.json ...]")

    chat_data = read_json(file_list[0])

    # One Participant per name listed in the export header.
    participants = [Participant(p["name"]) for p in chat_data["participants"]]
    # Participant repairs mojibake in names via ftfy, so index by the repaired
    # name; this replaces the original O(participants x messages) scan that
    # re-ran ftfy on the sender name once per participant.
    by_name = {p.name: p for p in participants}

    for f in file_list:
        chat_data = read_json(f)

        for m in chat_data["messages"]:
            if m["type"] == "Generic" and "content" in m:
                p = by_name.get(ftfy.ftfy(m["sender_name"]))
                if p is not None:
                    p.add_message(m["timestamp_ms"], m["content"])

    # Messenger lists messages newest-first, so the last one is the oldest.
    chat_start = datetime.fromtimestamp(chat_data["messages"][-1]["timestamp_ms"] / 1000)

    words_in_chat = []
    for p in participants:
        words_in_chat.extend(p.get_words(longer_than=0))

    print(f"This chat started: {chat_start}\n")
    print("The participants of this chat:")
    for p in participants:
        print(f"{p.name}\n")

    # analyzing.make_wordcloud(words_in_chat)
    analyzing.make_timeline(participants)


if __name__ == "__main__":
    main()
|
||||
|
|
@ -0,0 +1,122 @@
|
|||
from collections import Counter
from os import path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from PIL import Image
from wordcloud import WordCloud

from participant import Participant
from stopwords import HUNGARIAN_STOPWORDS, clean_text
|
||||
|
||||
|
||||
def incidents_of_words(words):
    """Map each distinct word in *words* to its occurrence count.

    Counts are floats because WordCloud.generate_from_frequencies consumes
    them. Uses collections.Counter for one O(n) pass instead of calling
    list.count() once per distinct word (O(n^2) on large chats).
    """
    return {word: float(count) for word, count in Counter(words).items()}
|
||||
|
||||
|
||||
def create_dataframe(participants):
    """Build a long-format DataFrame of monthly message counts.

    One row per (participant, month) with columns:
        month  -- first day of the month as a Timestamp
        counts -- number of messages that participant sent that month
        sender -- the participant's name
    """
    # An empty frame up front fixes the column order for the concat below.
    frames = [pd.DataFrame({"month": [], "counts": [], "sender": []})]

    for p in participants:
        per_message = pd.DataFrame({"date": list(p.messages.keys()),
                                    "message": list(p.messages.values())})

        # Bucket this participant's messages by calendar month.
        by_month = (pd.to_datetime(per_message["date"])
                    .dt.to_period("M")
                    .value_counts()
                    .sort_index())
        by_month.index = pd.PeriodIndex(by_month.index).to_timestamp()
        df_month = by_month.rename_axis("month").reset_index(name="counts")

        # Broadcast the name to every row (replaces the fragile
        # [p.name] * df_month.count()[0] list construction).
        df_month["sender"] = p.name
        frames.append(df_month)

    # Concatenate once at the end instead of growing a DataFrame per loop
    # iteration (which copies everything each time).
    return pd.concat(frames)
|
||||
|
||||
|
||||
def make_wordcloud(WORDS_IN_CHAT):
    """Render a word cloud of the chat vocabulary into Resources/wordcloud.png,
    masked by Resources/mask.png and filtered by the Hungarian stopword list."""
    # For some reason PIL doesn't work with relative paths, so build an
    # absolute path to the Resources directory.
    resource_location = path.abspath('Resources')
    mask = np.array(Image.open(f"{resource_location}/mask.png"))

    frequencies = clean_text(incidents_of_words(WORDS_IN_CHAT), HUNGARIAN_STOPWORDS)

    cloud = WordCloud(
        width=2000,
        height=2000,
        background_color="white",
        mode="RGBA",
        prefer_horizontal=1,
        mask=mask,
        max_words=600,
        include_numbers=True,
    ).generate_from_frequencies(frequencies=frequencies)

    # Plot the image (useful interactively; the PNG below is the real output).
    plt.figure(figsize=(10, 10), facecolor='k')
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.tight_layout(pad=0)

    cloud.to_file(f"{resource_location}/wordcloud.png")
|
||||
|
||||
|
||||
def make_timeline(participants):
    """Plot each participant's monthly message count as a line chart and
    save it as by-month.png in the working directory."""
    df = create_dataframe(participants)

    # One line per sender over the monthly message counts.
    fig = px.line(df, x="month", y="counts", color="sender")
    fig.update_layout({
        "showlegend": False,
        "title": {"text": "Timeline of Messages",
                  "xanchor": "center",
                  "x": 0.5},
        "xaxis": {"showgrid": False,
                  "title": ""},
        "yaxis": {"gridcolor": "white",
                  "nticks": 2,
                  "title": ""},
        # Transparent backgrounds so the image composes onto any page.
        "paper_bgcolor": 'rgba(0,0,0,0)',
        "plot_bgcolor": 'rgba(0,0,0,0)',
    })
    # Static export requires the kaleido package (listed in requirements).
    fig.write_image("by-month.png", format="png", width=1500, height=600, scale=3)
|
||||
|
|
@ -0,0 +1,26 @@
|
|||
import ftfy
|
||||
from datetime import datetime
|
||||
import string
|
||||
|
||||
|
||||
def remove_punctuations(s):
    """Return *s* with every ASCII punctuation character removed."""
    return s.translate(str.maketrans('', '', string.punctuation))


class Participant:
    """One chat participant and the messages they sent.

    Attributes:
        name     -- display name, mojibake-repaired with ftfy
        messages -- maps datetime string -> repaired message text
    """

    def __init__(self, name):
        self.name = ftfy.ftfy(name)
        # NOTE(review): keyed by second-resolution timestamp strings, so two
        # messages sent within the same second overwrite each other.
        self.messages = {}

    def add_message(self, timestamp, message):
        """Store *message* under its millisecond *timestamp* (rendered as a
        datetime string); text goes through ftfy to undo Messenger's
        latin-1/utf-8 mojibake."""
        self.messages[str(datetime.fromtimestamp(timestamp / 1000))] = ftfy.ftfy(message)

    def get_words(self, longer_than=0):
        """Return every word from this participant's messages, uppercased and
        stripped of punctuation, keeping only words longer than *longer_than*.

        Splits on any whitespace run (not just single spaces), so newlines
        and double spaces no longer yield bogus tokens.
        """
        return [remove_punctuations(w.upper())
                for m in self.messages.values()
                for w in m.split()
                if len(w) > longer_than]
|
||||
|
|
@ -0,0 +1,19 @@
|
|||
import os
|
||||
import emoji
|
||||
|
||||
|
||||
def clean_text(d, s):
    """Strip stopwords and emoji from the frequency dict *d*, in place.

    Removes the upper-cased form of every stopword in *s*, then every key
    that is an emoji. Returns the same (mutated) dict for convenience.
    """
    for stopword in s:
        d.pop(stopword.upper(), None)

    # Snapshot the keys so we can delete from d while scanning it.
    for key in list(d):
        if emoji.is_emoji(key):
            del d[key]

    return d
|
||||
|
||||
|
||||
# Directory of this source file; Resources/ lives one level above it.
FILE = os.path.dirname(__file__)

# The stopword list is UTF-8 (Hungarian accented characters); pin the
# encoding instead of relying on the platform default, and close the file
# handle instead of leaking it.
with open(os.path.join(FILE, '../Resources/stop_words_hungarian.txt'),
          encoding='utf-8') as _stopword_file:
    HUNGARIAN_STOPWORDS = set(map(str.strip, _stopword_file))
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
ftfy
|
||||
matplotlib
|
||||
pandas
|
||||
wordcloud
|
||||
numpy
|
||||
plotly
|
||||
kaleido
|
||||
Loading…
Reference in New Issue