working wordcloud and timeline
This commit is contained in:
parent
4ff61e339e
commit
ed7a9eed80
|
|
@ -191,3 +191,6 @@ poetry.toml
|
|||
|
||||
# Custom
|
||||
*.json
|
||||
*.csv
|
||||
*.png
|
||||
!mask.png
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 54 KiB |
|
|
@ -0,0 +1,57 @@
|
|||
a
|
||||
át
|
||||
az
|
||||
be
|
||||
csak
|
||||
de
|
||||
egy
|
||||
el
|
||||
én
|
||||
és
|
||||
fel
|
||||
hát
|
||||
hogy
|
||||
ide
|
||||
igen
|
||||
ki
|
||||
le
|
||||
lesz
|
||||
meg
|
||||
mi
|
||||
mint
|
||||
nem
|
||||
ő
|
||||
oda
|
||||
ők
|
||||
ön
|
||||
össze
|
||||
rá
|
||||
szét
|
||||
te
|
||||
ti
|
||||
vagy
|
||||
van
|
||||
vissza
|
||||
volt
|
||||
h
|
||||
is
|
||||
ha
|
||||
ez
|
||||
mert
|
||||
már
|
||||
akkor
|
||||
még
|
||||
most
|
||||
jó
|
||||
azt
|
||||
:
|
||||
.
|
||||
?
|
||||
!
|
||||
,
|
||||
;
|
||||
(
|
||||
)
|
||||
d
|
||||
:-)
|
||||
:d
|
||||
|
|
@ -0,0 +1,52 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import json
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from participant import Participant
|
||||
import ftfy
|
||||
import analyzing
|
||||
|
||||
|
||||
def read_json(filename):
    """Load and return the parsed JSON content of *filename*.

    Messenger exports are UTF-8; pin the encoding so the read does not
    depend on the platform default (e.g. cp1252 on Windows).
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)
|
||||
|
||||
|
||||
def main():
    """Read the Messenger JSON export(s) given on the command line, collect
    each participant's messages, and generate the analysis artefacts."""
    file_list = sys.argv[1:]
    if not file_list:
        # Original code raised IndexError here; fail with a usage hint instead.
        sys.exit("usage: main.py export1.json [export2.json ...]")

    chat_data = read_json(file_list[0])

    # One Participant per name listed in the export header.
    participants = [Participant(p["name"]) for p in chat_data["participants"]]
    # Participant repairs mojibake in names via ftfy, so index by the repaired
    # name; this replaces the original O(participants x messages) scan that
    # re-ran ftfy on the sender name once per participant.
    by_name = {p.name: p for p in participants}

    for f in file_list:
        chat_data = read_json(f)

        for m in chat_data["messages"]:
            if m["type"] == "Generic" and "content" in m:
                p = by_name.get(ftfy.ftfy(m["sender_name"]))
                if p is not None:
                    p.add_message(m["timestamp_ms"], m["content"])

    # Messenger lists messages newest-first, so the last one is the oldest.
    chat_start = datetime.fromtimestamp(chat_data["messages"][-1]["timestamp_ms"] / 1000)

    words_in_chat = []
    for p in participants:
        words_in_chat.extend(p.get_words(longer_than=0))

    print(f"This chat started: {chat_start}\n")
    print("The participants of this chat:")
    for p in participants:
        print(f"{p.name}\n")

    # analyzing.make_wordcloud(words_in_chat)
    analyzing.make_timeline(participants)


if __name__ == "__main__":
    main()
|
||||
|
|
@ -0,0 +1,122 @@
|
|||
from collections import Counter
from os import path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from PIL import Image
from wordcloud import WordCloud

from participant import Participant
from stopwords import HUNGARIAN_STOPWORDS, clean_text
|
||||
|
||||
|
||||
def incidents_of_words(words):
    """Map each distinct word in *words* to its occurrence count.

    Counts are floats because WordCloud.generate_from_frequencies consumes
    them. Uses collections.Counter for one O(n) pass instead of calling
    list.count() once per distinct word (O(n^2) on large chats).
    """
    return {word: float(count) for word, count in Counter(words).items()}
|
||||
|
||||
|
||||
def create_dataframe(participants):
    """Build a long-format DataFrame of monthly message counts.

    One row per (participant, month) with columns:
        month  -- first day of the month as a Timestamp
        counts -- number of messages that participant sent that month
        sender -- the participant's name
    """
    # An empty frame up front fixes the column order for the concat below.
    frames = [pd.DataFrame({"month": [], "counts": [], "sender": []})]

    for p in participants:
        per_message = pd.DataFrame({"date": list(p.messages.keys()),
                                    "message": list(p.messages.values())})

        # Bucket this participant's messages by calendar month.
        by_month = (pd.to_datetime(per_message["date"])
                    .dt.to_period("M")
                    .value_counts()
                    .sort_index())
        by_month.index = pd.PeriodIndex(by_month.index).to_timestamp()
        df_month = by_month.rename_axis("month").reset_index(name="counts")

        # Broadcast the name to every row (replaces the fragile
        # [p.name] * df_month.count()[0] list construction).
        df_month["sender"] = p.name
        frames.append(df_month)

    # Concatenate once at the end instead of growing a DataFrame per loop
    # iteration (which copies everything each time).
    return pd.concat(frames)
|
||||
|
||||
|
||||
def make_wordcloud(WORDS_IN_CHAT):
    """Render a word cloud of the chat vocabulary into Resources/wordcloud.png,
    masked by Resources/mask.png and filtered by the Hungarian stopword list."""
    # For some reason PIL doesn't work with relative paths, so build an
    # absolute path to the Resources directory.
    resource_location = path.abspath('Resources')
    mask = np.array(Image.open(f"{resource_location}/mask.png"))

    frequencies = clean_text(incidents_of_words(WORDS_IN_CHAT), HUNGARIAN_STOPWORDS)

    cloud = WordCloud(
        width=2000,
        height=2000,
        background_color="white",
        mode="RGBA",
        prefer_horizontal=1,
        mask=mask,
        max_words=600,
        include_numbers=True,
    ).generate_from_frequencies(frequencies=frequencies)

    # Plot the image (useful interactively; the PNG below is the real output).
    plt.figure(figsize=(10, 10), facecolor='k')
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.tight_layout(pad=0)

    cloud.to_file(f"{resource_location}/wordcloud.png")
|
||||
|
||||
|
||||
def make_timeline(participants):
    """Plot each participant's monthly message count as a line chart and
    save it as by-month.png in the working directory."""
    df = create_dataframe(participants)

    # One line per sender over the monthly message counts.
    fig = px.line(df, x="month", y="counts", color="sender")
    fig.update_layout({
        "showlegend": False,
        "title": {"text": "Timeline of Messages",
                  "xanchor": "center",
                  "x": 0.5},
        "xaxis": {"showgrid": False,
                  "title": ""},
        "yaxis": {"gridcolor": "white",
                  "nticks": 2,
                  "title": ""},
        # Transparent backgrounds so the image composes onto any page.
        "paper_bgcolor": 'rgba(0,0,0,0)',
        "plot_bgcolor": 'rgba(0,0,0,0)',
    })
    # Static export requires the kaleido package (listed in requirements).
    fig.write_image("by-month.png", format="png", width=1500, height=600, scale=3)
|
||||
|
|
@ -0,0 +1,26 @@
|
|||
import ftfy
|
||||
from datetime import datetime
|
||||
import string
|
||||
|
||||
|
||||
def remove_punctuations(s):
    """Return *s* with every ASCII punctuation character removed."""
    return s.translate(str.maketrans('', '', string.punctuation))


class Participant:
    """One chat participant and the messages they sent.

    Attributes:
        name     -- display name, mojibake-repaired with ftfy
        messages -- maps datetime string -> repaired message text
    """

    def __init__(self, name):
        self.name = ftfy.ftfy(name)
        # NOTE(review): keyed by second-resolution timestamp strings, so two
        # messages sent within the same second overwrite each other.
        self.messages = {}

    def add_message(self, timestamp, message):
        """Store *message* under its millisecond *timestamp* (rendered as a
        datetime string); text goes through ftfy to undo Messenger's
        latin-1/utf-8 mojibake."""
        self.messages[str(datetime.fromtimestamp(timestamp / 1000))] = ftfy.ftfy(message)

    def get_words(self, longer_than=0):
        """Return every word from this participant's messages, uppercased and
        stripped of punctuation, keeping only words longer than *longer_than*.

        Splits on any whitespace run (not just single spaces), so newlines
        and double spaces no longer yield bogus tokens.
        """
        return [remove_punctuations(w.upper())
                for m in self.messages.values()
                for w in m.split()
                if len(w) > longer_than]
|
||||
|
|
@ -0,0 +1,19 @@
|
|||
import os
|
||||
import emoji
|
||||
|
||||
|
||||
def clean_text(d, s):
    """Strip stopwords and emoji from the frequency dict *d*, in place.

    Removes the upper-cased form of every stopword in *s*, then every key
    that is an emoji. Returns the same (mutated) dict for convenience.
    """
    for stopword in s:
        d.pop(stopword.upper(), None)

    # Snapshot the keys so we can delete from d while scanning it.
    for key in list(d):
        if emoji.is_emoji(key):
            del d[key]

    return d
|
||||
|
||||
|
||||
# Directory of this source file; Resources/ lives one level above it.
FILE = os.path.dirname(__file__)

# The stopword list is UTF-8 (Hungarian accented characters); pin the
# encoding instead of relying on the platform default, and close the file
# handle instead of leaking it.
with open(os.path.join(FILE, '../Resources/stop_words_hungarian.txt'),
          encoding='utf-8') as _stopword_file:
    HUNGARIAN_STOPWORDS = set(map(str.strip, _stopword_file))
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
ftfy
|
||||
matplotlib
|
||||
pandas
|
||||
wordcloud
|
||||
numpy
|
||||
plotly
|
||||
kaleido
|
||||
Loading…
Reference in New Issue