diff --git a/.gitignore b/.gitignore
index afd785f..ad1dd73 100644
--- a/.gitignore
+++ b/.gitignore
@@ -190,4 +190,7 @@ poetry.toml
 # End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode
 
 # Custom
-*.json
\ No newline at end of file
+*.json
+*.csv
+*.png
+!mask.png
\ No newline at end of file
diff --git a/Resources/mask.png b/Resources/mask.png
new file mode 100644
index 0000000..b752c37
Binary files /dev/null and b/Resources/mask.png differ
diff --git a/Resources/stop_words_hungarian.txt b/Resources/stop_words_hungarian.txt
new file mode 100644
index 0000000..f236efa
--- /dev/null
+++ b/Resources/stop_words_hungarian.txt
@@ -0,0 +1,57 @@
+a
+át
+az
+be
+csak
+de
+egy
+el
+én
+és
+fel
+hát
+hogy
+ide
+igen
+ki
+le
+lesz
+meg
+mi
+mint
+nem
+ő
+oda
+ők
+ön
+össze
+rá
+szét
+te
+ti
+vagy
+van
+vissza
+volt
+h
+is
+ha
+ez
+mert
+már
+akkor
+még
+most
+jó
+azt
+:
+.
+?
+!
+,
+;
+(
+)
+d
+:-)
+:d
\ No newline at end of file
diff --git a/chatanalyzer/__init__.py b/chatanalyzer/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/chatanalyzer/__main__.py b/chatanalyzer/__main__.py
new file mode 100644
index 0000000..ee162ec
--- /dev/null
+++ b/chatanalyzer/__main__.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+
+import json
+import sys
+from datetime import datetime
+from participant import Participant
+import ftfy
+import analyzing
+
+
+def read_json(filename):
+    with open(filename, 'r') as f:
+        return json.load(f)
+
+
+def main():
+    participants = []
+    file_list = list(sys.argv[1:])
+
+    chat_data = read_json(file_list[0])
+
+    participant_count = len(chat_data["participants"])
+
+    for i in range(participant_count):
+        participants.append(Participant(chat_data["participants"][i]["name"]))
+
+    for f in file_list:
+        chat_data = read_json(f)
+
+        for m in chat_data["messages"]:
+            for p in participants:
+                if m["type"] == "Generic" and "content" in m:
+                    if ftfy.ftfy(m["sender_name"]) == p.name:
+                        p.add_message(m["timestamp_ms"], m["content"])
+
+    CHAT_START = datetime.fromtimestamp(chat_data["messages"][-1]["timestamp_ms"]/1000)
+
+    WORDS_IN_CHAT = []
+    for p in participants:
+        WORDS_IN_CHAT.extend(p.get_words(longer_than=0))
+
+    print(f"This chat started: {CHAT_START}\n")
+    print("The participants of this chat:")
+    for p in participants:
+        print(f"{p.name}\n")
+
+    #analyzing.make_wordcloud(WORDS_IN_CHAT)
+    analyzing.make_timeline(participants)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/chatanalyzer/analyzing.py b/chatanalyzer/analyzing.py
new file mode 100644
index 0000000..58f2333
--- /dev/null
+++ b/chatanalyzer/analyzing.py
@@ -0,0 +1,122 @@
+from wordcloud import WordCloud
+import matplotlib.pyplot as plt
+from stopwords import HUNGARIAN_STOPWORDS, clean_text
+import numpy as np
+from PIL import Image
+from os import path
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objs as go
+from participant import Participant
+
+
+def incidents_of_words(words):
+    word_set = set(words)
+
+    d = {}
+
+    for word in word_set:
+        d[word] = float(words.count(word))
+
+    return d
+
+
+def create_dataframe(participants):
+    # m_df = {}
+    # sender = []
+    # date = []
+    # message = []
+    skeleton = {"month": [],
+                "counts": [],
+                "sender": []}
+    df = pd.DataFrame(skeleton)
+    for p in participants:
+        date=[]
+        date += (p.messages.keys())
+        # print(len(date))
+        message=[]
+        message += (p.messages.values())
+        # print(len(message))
+
+        m_di = {"date": date,
+                "message": message}
+
+        m_df = pd.DataFrame(m_di)
+
+
+        by_month = pd.to_datetime(m_df['date']).dt.to_period('M').value_counts().sort_index()
+        by_month.index = pd.PeriodIndex(by_month.index).to_timestamp()
+        df_month = by_month.rename_axis('month').reset_index(name='counts')
+
+        sender=[]
+        sender += [p.name]*df_month.count()[0]
+
+        df_month["sender"] = sender
+        # gfg_csv_data = pd.DataFrame(df_month).to_csv('GfGbbbbb.csv', index = True)
+
+        df = pd.concat([df, df_month])
+
+
+    # gfg_csv_data = pd.DataFrame(df).to_csv('GfGaaaa.csv', index = True)
+
+    return pd.DataFrame(df)
+
+
+def make_wordcloud(WORDS_IN_CHAT):
+    # For some reason PIL doesn't work with relative path so I have to use absolute paths
+    resource_location = f"{path.abspath('Resources')}"
+    img_mask = np.array(Image.open(f"{resource_location}/mask.png"))
+    wordcloud = WordCloud(width = 2000, height = 2000,
+                          background_color = "white",
+                          mode="RGBA",
+                          prefer_horizontal=1,
+                          mask = img_mask,
+                          # relative_scaling=1,
+                          # font_step=1,
+                          # min_font_size=2,
+                          max_words=600,
+                          include_numbers=True
+                          ).generate_from_frequencies(frequencies=clean_text(incidents_of_words(WORDS_IN_CHAT), HUNGARIAN_STOPWORDS))
+
+    # plot the WordCloud image
+    plt.figure(figsize = (10, 10), facecolor = 'k')
+    plt.imshow(wordcloud, interpolation="bilinear")
+    plt.axis("off")
+    plt.tight_layout(pad = 0)
+
+    wordcloud.to_file(f"{resource_location}/wordcloud.png")
+
+
+def make_timeline(participants):
+    df = create_dataframe(participants)
+
+    # by_month = pd.to_datetime(df['date']).dt.to_period('M').value_counts().sort_index()
+    # by_month.index = pd.PeriodIndex(by_month.index)
+    # df_month = by_month.rename_axis('month').reset_index(name='counts')
+
+    # fig = go.Figure(data=go.Scatter(x=df_month['month'].astype(dtype=str),
+    #                                 y=df_month['counts'],
+    #                                 marker_color='indianred', text="counts"))
+    # fig = go.Figure()
+    # for sender, group in df.groupby("sender"):
+    #     fig.add_trace(go.Scatter(x=group["month"].to_list(), y=group["counts"].to_list(), name=sender))
+    # fig = px.line(df_month, x='month', y='counts')
+
+    fig = px.line(df, x="month", y="counts", color="sender")
+    fig.update_layout({ "showlegend": False,
+                        "title": {"text": "Timeline of Messages",
+                                  "xanchor": "center",
+                                  "x": 0.5},
+                        "xaxis": {"showgrid": False,
+                                  "title": ""},
+                        "yaxis": {"gridcolor": "white",
+                                  "nticks": 2,
+                                  "title": ""},
+                        "paper_bgcolor": 'rgba(0,0,0,0)',
+                        "plot_bgcolor": 'rgba(0,0,0,0)'
+                        })
+    # fig.show()
+    fig.write_image("by-month.png", format="png", width=1500, height=600, scale=3)
+
+    # saving the DataFrame as a CSV file
+    # gfg_csv_data = df.to_csv('GfG.csv', index = True)
diff --git a/chatanalyzer/participant.py b/chatanalyzer/participant.py
new file mode 100644
index 0000000..020b36c
--- /dev/null
+++ b/chatanalyzer/participant.py
@@ -0,0 +1,26 @@
+import ftfy
+from datetime import datetime
+import string
+
+
+def remove_punctuations(s):
+    return s.translate(str.maketrans('', '', string.punctuation))
+
+
+class Participant:
+    def __init__(self, name):
+        self.name = ftfy.ftfy(name)
+        self.messages = {}
+
+    def add_message(self, timestamp, message):
+        self.messages[str(datetime.fromtimestamp(timestamp/1000))] = ftfy.ftfy(message)
+
+    def get_words(self, longer_than=0):
+        words = []
+
+        for m in self.messages.values():
+            for w in m.split(' '):
+                if len(w) > longer_than:
+                    words.append(remove_punctuations(w.upper()))
+
+        return words
diff --git a/chatanalyzer/stopwords.py b/chatanalyzer/stopwords.py
new file mode 100644
index 0000000..d67b819
--- /dev/null
+++ b/chatanalyzer/stopwords.py
@@ -0,0 +1,21 @@
+import os
+import emoji
+
+
+def clean_text(d, s):
+    # Remove the stopwords in s (keys are stored uppercased) and any emoji keys from the word-frequency dict d.
+    for w in s:
+        d.pop(w.upper(), None)
+
+    keys = list(d.keys())
+
+    for k in keys:
+        if emoji.is_emoji(k):
+            d.pop(k, None)
+
+    return d
+
+
+FILE = os.path.dirname(__file__)
+with open(os.path.join(FILE, '../Resources/stop_words_hungarian.txt'), encoding='utf-8') as f:
+    HUNGARIAN_STOPWORDS = set(map(str.strip, f.readlines()))
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..0511c96
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,9 @@
+ftfy
+matplotlib
+pandas
+wordcloud
+numpy
+plotly
+kaleido
+emoji
+Pillow
\ No newline at end of file