working wordcloud and timeline

This commit is contained in:
Márton Kónya 2022-11-26 16:38:20 +01:00
parent 4ff61e339e
commit ed7a9eed80
9 changed files with 287 additions and 1 deletions

3
.gitignore vendored
View File

@ -191,3 +191,6 @@ poetry.toml
# Custom
*.json
*.csv
*.png
!mask.png

BIN
Resources/mask.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 54 KiB

View File

@ -0,0 +1,57 @@
a
át
az
be
csak
de
egy
el
én
és
fel
hát
hogy
ide
igen
ki
le
lesz
meg
mi
mint
nem
ő
oda
ők
ön
össze
szét
te
ti
vagy
van
vissza
volt
h
is
ha
ez
mert
már
akkor
még
most
azt
:
.
?
!
,
;
(
)
d
:-)
:d

0
chatanalyzer/__init__.py Normal file
View File

52
chatanalyzer/__main__.py Normal file
View File

@ -0,0 +1,52 @@
#!/usr/bin/env python
import json
import sys
from datetime import datetime
from participant import Participant
import ftfy
import analyzing
def read_json(filename):
    """Load and return the parsed JSON document stored in *filename*.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(filename, encoding="utf-8") as f:
        return json.load(f)
def main():
    """Analyze the chat export files named on the command line.

    Every command-line argument is a JSON file containing a "participants"
    list and a "messages" list. The participant roster is taken from the
    first file; messages from all files are attributed to those participants
    by sender name. The function prints the time of the earliest message and
    the participant names, then renders the message timeline.

    Exits with a usage message when no file is given.
    """
    file_list = sys.argv[1:]
    if not file_list:
        sys.exit("usage: chatanalyzer CHAT_JSON [CHAT_JSON ...]")

    participants = []
    earliest_ms = None
    for index, filename in enumerate(file_list):
        chat_data = read_json(filename)
        if index == 0:
            # The roster comes from the first file only.
            participants = [
                Participant(entry["name"])
                for entry in chat_data["participants"]
            ]
        for m in chat_data["messages"]:
            # Track the oldest timestamp over every message of every file so
            # the reported start does not depend on argument order.
            timestamp_ms = m.get("timestamp_ms")
            if timestamp_ms is not None and (
                earliest_ms is None or timestamp_ms < earliest_ms
            ):
                earliest_ms = timestamp_ms

            # Only generic messages that carry text are analyzed.
            if m.get("type") != "Generic" or "content" not in m:
                continue
            # Repair the sender name once per message, not once per participant.
            sender = ftfy.ftfy(m["sender_name"])
            for p in participants:
                if p.name == sender:
                    p.add_message(m["timestamp_ms"], m["content"])

    if earliest_ms is None:
        chat_start = "unknown"
    else:
        chat_start = datetime.fromtimestamp(earliest_ms / 1000)

    # Word list feeding the word cloud; the make_wordcloud call below is
    # currently disabled.
    words_in_chat = []
    for p in participants:
        words_in_chat.extend(p.get_words(longer_than=0))

    print(f"This chat started: {chat_start}\n")
    print("The participants of this chat:")
    for p in participants:
        print(f"{p.name}\n")

    # analyzing.make_wordcloud(words_in_chat)
    analyzing.make_timeline(participants)


if __name__ == "__main__":
    main()

122
chatanalyzer/analyzing.py Normal file
View File

@ -0,0 +1,122 @@
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from stopwords import HUNGARIAN_STOPWORDS, clean_text
import numpy as np
from PIL import Image
from os import path
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from participant import Participant
def incidents_of_words(words):
    """Count how often each word occurs.

    Args:
        words: Iterable of hashable words.

    Returns:
        dict mapping every distinct word to its number of occurrences as a
        float. Empty input yields an empty dict.
    """
    # Single pass over the input; calling words.count() for every distinct
    # word would rescan the whole list each time.
    counts = {}
    for word in words:
        counts[word] = counts.get(word, 0.0) + 1.0
    return counts
def create_dataframe(participants):
    """Build a table of message counts per sender and calendar month.

    Args:
        participants: Iterable of objects exposing ``name`` and ``messages``,
            where the keys of ``messages`` are timestamp strings that
            ``pd.Timestamp`` can parse.

    Returns:
        A DataFrame with the columns ``month`` (timestamp of the first day of
        the month), ``counts`` (messages sent in that month) and ``sender``
        (the participant's name). Rows are grouped per participant in input
        order and sorted chronologically within each participant.
        Participants without messages contribute no rows; with no rows at all
        an empty frame with the same three columns is returned.
    """
    frames = []
    for p in participants:
        if not p.messages:
            continue
        # Parse every key on its own: str(datetime) only prints fractional
        # seconds when they are non-zero, so the keys do not share one format
        # and bulk format inference can reject them.
        stamps = pd.Series([pd.Timestamp(key) for key in p.messages])
        by_month = stamps.dt.to_period("M").value_counts().sort_index()
        by_month.index = pd.PeriodIndex(by_month.index).to_timestamp()
        df_month = by_month.rename_axis("month").reset_index(name="counts")
        df_month["sender"] = p.name
        frames.append(df_month)

    if not frames:
        return pd.DataFrame({"month": [], "counts": [], "sender": []})
    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(frames)
def make_wordcloud(WORDS_IN_CHAT):
    """Render a masked word cloud of the chat's words and save it as a PNG.

    Word frequencies are computed from *WORDS_IN_CHAT*, filtered through
    ``clean_text`` with the Hungarian stop-word list, drawn inside the shape
    given by ``Resources/mask.png`` and written to
    ``Resources/wordcloud.png``.

    Args:
        WORDS_IN_CHAT: List of words (one entry per occurrence).
    """
    # PIL is given an absolute path. The Resources directory is located
    # relative to this module (not the current working directory), the same
    # way stopwords.py locates its word list.
    resource_location = path.abspath(
        path.join(path.dirname(__file__), "..", "Resources")
    )
    img_mask = np.array(Image.open(path.join(resource_location, "mask.png")))

    frequencies = clean_text(
        incidents_of_words(WORDS_IN_CHAT), HUNGARIAN_STOPWORDS
    )
    wordcloud = WordCloud(
        width=2000,
        height=2000,
        background_color="white",
        mode="RGBA",
        prefer_horizontal=1,
        mask=img_mask,
        max_words=600,
        include_numbers=True,
    ).generate_from_frequencies(frequencies=frequencies)

    # Draw the cloud onto a matplotlib figure as well; nothing in this
    # function shows or saves that figure.
    plt.figure(figsize=(10, 10), facecolor='k')
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.tight_layout(pad=0)

    # The image file itself is written by the WordCloud object.
    wordcloud.to_file(path.join(resource_location, "wordcloud.png"))
def make_timeline(participants):
    """Plot messages per month for each participant and save the chart.

    The monthly counts come from ``create_dataframe``; one line is drawn per
    sender and the figure is written to ``by-month.png`` in the current
    working directory.

    Args:
        participants: The participants passed on to ``create_dataframe``.
    """
    monthly_counts = create_dataframe(participants)
    figure = px.line(monthly_counts, x="month", y="counts", color="sender")

    # Layout pieces, named separately to keep the update call readable.
    title = {"text": "Timeline of Messages", "xanchor": "center", "x": 0.5}
    x_axis = {"showgrid": False, "title": ""}
    y_axis = {"gridcolor": "white", "nticks": 2, "title": ""}
    transparent = 'rgba(0,0,0,0)'

    figure.update_layout(
        showlegend=False,
        title=title,
        xaxis=x_axis,
        yaxis=y_axis,
        paper_bgcolor=transparent,
        plot_bgcolor=transparent,
    )
    figure.write_image(
        "by-month.png", format="png", width=1500, height=600, scale=3
    )

View File

@ -0,0 +1,26 @@
import ftfy
from datetime import datetime
import string
# Translation table that deletes every character of string.punctuation.
# Built once at import time because remove_punctuations is called for each
# word of each message.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuations(s):
    """Return *s* with every ``string.punctuation`` character removed.

    Only the ASCII punctuation characters listed in ``string.punctuation``
    are stripped; all other characters are kept unchanged.
    """
    return s.translate(_PUNCTUATION_TABLE)
class Participant:
    """A chat member together with the messages they sent."""

    def __init__(self, name):
        """Create a participant with no messages.

        Args:
            name: Display name; it is passed through ``ftfy.ftfy`` before
                being stored.
        """
        self.name = ftfy.ftfy(name)
        # Maps str(datetime of the message) -> message text after ftfy.
        self.messages = {}

    def add_message(self, timestamp, message):
        """Store *message* under the time it was sent.

        Args:
            timestamp: Send time in milliseconds since the epoch.
            message: Message text; it is passed through ``ftfy.ftfy``.

        A message whose timestamp equals that of an already stored message
        replaces the stored one, because the timestamp string is the key.
        """
        sent_at = datetime.fromtimestamp(timestamp / 1000)
        self.messages[str(sent_at)] = ftfy.ftfy(message)

    def get_words(self, longer_than=0):
        """Return the upper-cased, punctuation-free words of all messages.

        Args:
            longer_than: Only words with more characters than this (measured
                after punctuation is removed) are returned.

        Returns:
            List of words, one entry per occurrence. Tokens that consist
            only of punctuation are dropped.
        """
        words = []
        for message in self.messages.values():
            # split() without an argument separates on any whitespace, so
            # words on different lines of a message are not glued together.
            for token in message.split():
                word = remove_punctuations(token.upper())
                # Filter on the cleaned word so punctuation-only tokens do
                # not end up in the result as empty strings.
                if word and len(word) > longer_than:
                    words.append(word)
        return words

19
chatanalyzer/stopwords.py Normal file
View File

@ -0,0 +1,19 @@
import os
import emoji
def clean_text(d, s):
    """Remove stop words and emoji keys from the frequency dict *d*.

    Args:
        d: dict keyed by word; it is modified in place.
        s: Iterable of stop words; each is upper-cased before the lookup.

    Returns:
        The same dict object *d*, after the removals.
    """
    for stop_word in s:
        d.pop(stop_word.upper(), None)

    # Collect the emoji keys first so the dict is not resized while it is
    # being iterated.
    emoji_keys = [key for key in d if emoji.is_emoji(key)]
    for key in emoji_keys:
        del d[key]
    return d
# Directory that contains this module.
FILE = os.path.dirname(__file__)

# Stop words, one per line of the resource file, with surrounding whitespace
# stripped. The file is read as UTF-8 explicitly (the list contains accented
# characters) and is closed again as soon as it has been read.
with open(
    os.path.join(FILE, '../Resources/stop_words_hungarian.txt'),
    encoding="utf-8",
) as _stopword_file:
    HUNGARIAN_STOPWORDS = set(map(str.strip, _stopword_file))

7
requirements.txt Normal file
View File

@ -0,0 +1,7 @@
ftfy
matplotlib
pandas
wordcloud
numpy
plotly
kaleido
emoji