working wordcloud and timeline
This commit is contained in:
		
							parent
							
								
									4ff61e339e
								
							
						
					
					
						commit
						ed7a9eed80
					
				|  | @ -191,3 +191,6 @@ poetry.toml | |||
| 
 | ||||
| # Custom | ||||
| *.json | ||||
| *.csv | ||||
| *.png | ||||
| !mask.png | ||||
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 54 KiB | 
|  | @ -0,0 +1,57 @@ | |||
| a | ||||
| át | ||||
| az | ||||
| be | ||||
| csak | ||||
| de | ||||
| egy | ||||
| el | ||||
| én | ||||
| és | ||||
| fel | ||||
| hát | ||||
| hogy | ||||
| ide | ||||
| igen | ||||
| ki | ||||
| le | ||||
| lesz | ||||
| meg | ||||
| mi | ||||
| mint | ||||
| nem | ||||
| ő | ||||
| oda | ||||
| ők | ||||
| ön | ||||
| össze | ||||
| rá | ||||
| szét | ||||
| te | ||||
| ti | ||||
| vagy | ||||
| van | ||||
| vissza | ||||
| volt | ||||
| h | ||||
| is | ||||
| ha | ||||
| ez | ||||
| mert | ||||
| már | ||||
| akkor | ||||
| még | ||||
| most | ||||
| jó | ||||
| azt | ||||
| : | ||||
| . | ||||
| ? | ||||
| ! | ||||
| , | ||||
| ; | ||||
| ( | ||||
| ) | ||||
| d | ||||
| :-) | ||||
| :d | ||||
|  | @ -0,0 +1,52 @@ | |||
| #!/usr/bin/env python | ||||
| 
 | ||||
| import json | ||||
| import sys | ||||
| from datetime import datetime | ||||
| from participant import Participant | ||||
| import ftfy | ||||
| import analyzing | ||||
| 
 | ||||
| 
 | ||||
def read_json(filename):
    """Load and return the parsed contents of a JSON file.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The deserialized document, exactly as produced by ``json.load``.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(filename, encoding="utf-8") as f:
        return json.load(f)
| 
 | ||||
| 
 | ||||
def main():
    """Read the chat export files named on the command line and analyze them.

    Each command-line argument is the path of a JSON file containing a
    ``participants`` list and a ``messages`` list. Participants are taken
    from the first file; messages are collected from every file. The chat
    start time and the participant names are printed, then the message
    timeline is rendered via ``analyzing.make_timeline``.

    Exits with a short message when no file is given or when the files
    contain no timestamped message.
    """
    file_list = sys.argv[1:]
    if not file_list:
        sys.exit(f"usage: {sys.argv[0]} CHAT_JSON [CHAT_JSON ...]")

    first_chat = read_json(file_list[0])
    participants = [
        Participant(entry["name"]) for entry in first_chat["participants"]
    ]

    timestamps = []
    for filename in file_list:
        chat_data = read_json(filename)

        for m in chat_data["messages"]:
            if "timestamp_ms" in m:
                timestamps.append(m["timestamp_ms"])

            # Only plain text messages are attributed to a participant.
            if m["type"] != "Generic" or "content" not in m:
                continue

            # Repair the sender name once per message, not once per participant.
            sender = ftfy.ftfy(m["sender_name"])
            for p in participants:
                if p.name == sender:
                    p.add_message(m["timestamp_ms"], m["content"])

    if not timestamps:
        sys.exit("no messages found in the given files")

    # The chat starts at the earliest message across *all* files, regardless
    # of the order in which the files were passed or stored.
    chat_start = datetime.fromtimestamp(min(timestamps) / 1000)

    words_in_chat = []
    for p in participants:
        words_in_chat.extend(p.get_words(longer_than=0))

    print(f"This chat started: {chat_start}\n")
    print("The participants of this chat:")
    for p in participants:
        print(f"{p.name}\n")

    # NOTE: the word cloud is currently disabled; re-enable with
    # analyzing.make_wordcloud(words_in_chat)
    analyzing.make_timeline(participants)


if __name__ == "__main__":
    main()
|  | @ -0,0 +1,122 @@ | |||
| from wordcloud import WordCloud | ||||
| import matplotlib.pyplot as plt | ||||
| from stopwords import HUNGARIAN_STOPWORDS, clean_text | ||||
| import numpy as np | ||||
| from PIL import Image | ||||
| from os import path | ||||
| import pandas as pd | ||||
| import plotly.express as px | ||||
| import plotly.graph_objs as go | ||||
| from participant import Participant | ||||
| 
 | ||||
| 
 | ||||
def incidents_of_words(words):
    """Count how often each distinct word occurs in *words*.

    Args:
        words: Iterable of hashable items (typically strings).

    Returns:
        A dict mapping each distinct word to its number of occurrences,
        stored as a ``float``.
    """
    # Local import keeps this block self-contained.
    from collections import Counter

    # Counter tallies in a single pass; calling ``words.count`` once per
    # distinct word was quadratic in the size of the chat.
    return {word: float(count) for word, count in Counter(words).items()}
| 
 | ||||
| 
 | ||||
def create_dataframe(participants):
    """Build a per-sender, per-month message count table.

    Args:
        participants: Iterable of objects exposing ``name`` and ``messages``,
            where the keys of ``messages`` are timestamp strings.

    Returns:
        A ``pandas.DataFrame`` with the columns ``month`` (timestamp of the
        first day of the month), ``counts`` (number of messages in that
        month) and ``sender`` (the participant's name). Rows are grouped by
        participant and sorted by month within each participant. An empty
        frame with the same columns is returned when there is nothing to
        count.
    """
    frames = []
    for p in participants:
        if not p.messages:
            # Nothing to count; an empty frame would only add dtype noise
            # to the final concat.
            continue

        # Parse every key on its own: the keys may mix timestamps with and
        # without a fractional-second part, which a single format inferred
        # from the first element would reject.
        stamps = pd.Series(
            pd.to_datetime([pd.Timestamp(key) for key in p.messages])
        )

        by_month = stamps.dt.to_period('M').value_counts().sort_index()
        by_month.index = pd.PeriodIndex(by_month.index).to_timestamp()
        df_month = by_month.rename_axis('month').reset_index(name='counts')
        df_month["sender"] = p.name  # scalar is broadcast to every row

        frames.append(df_month)

    if not frames:
        return pd.DataFrame({"month": [], "counts": [], "sender": []})

    # Concatenate once at the end instead of growing a frame in the loop.
    return pd.concat(frames)
| 
 | ||||
| 
 | ||||
def make_wordcloud(WORDS_IN_CHAT):
    """Render a masked word cloud of the chat and save it as a PNG.

    The mask is read from ``Resources/mask.png`` and the result is written
    to ``Resources/wordcloud.png``; ``Resources`` is resolved against the
    current working directory.

    Args:
        WORDS_IN_CHAT: List of words whose frequencies drive the cloud.
    """
    # An absolute path is used because PIL reportedly failed with a
    # relative one.
    resources_dir = path.abspath('Resources')
    mask_image = Image.open(f"{resources_dir}/mask.png")
    mask_array = np.array(mask_image)

    cloud = WordCloud(
        width=2000,
        height=2000,
        background_color="white",
        mode="RGBA",
        prefer_horizontal=1,
        mask=mask_array,
        max_words=600,
        include_numbers=True,
    )

    # Word counts with the stop words (and emoji keys) filtered out.
    word_counts = incidents_of_words(WORDS_IN_CHAT)
    frequencies = clean_text(word_counts, HUNGARIAN_STOPWORDS)
    cloud.generate_from_frequencies(frequencies=frequencies)

    # Draw the cloud on a borderless matplotlib figure.
    plt.figure(figsize=(10, 10), facecolor='k')
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.tight_layout(pad=0)

    cloud.to_file(f"{resources_dir}/wordcloud.png")
| 
 | ||||
| 
 | ||||
def make_timeline(participants):
    """Plot messages per month for every sender and save it as ``by-month.png``.

    Args:
        participants: Passed straight to ``create_dataframe``, which supplies
            the ``month``, ``counts`` and ``sender`` columns plotted here.
    """
    monthly_counts = create_dataframe(participants)

    # One line per sender.
    fig = px.line(monthly_counts, x="month", y="counts", color="sender")

    transparent = 'rgba(0,0,0,0)'
    title_style = {"text": "Timeline of Messages",
                   "xanchor": "center",
                   "x": 0.5}
    x_axis_style = {"showgrid": False,
                    "title": ""}
    y_axis_style = {"gridcolor": "white",
                    "nticks": 2,
                    "title": ""}
    layout = {"showlegend": False,
              "title": title_style,
              "xaxis": x_axis_style,
              "yaxis": y_axis_style,
              "paper_bgcolor": transparent,
              "plot_bgcolor": transparent}
    fig.update_layout(layout)

    # Static export into the current working directory.
    fig.write_image("by-month.png", format="png", width=1500, height=600, scale=3)
|  | @ -0,0 +1,26 @@ | |||
| import ftfy | ||||
| from datetime import datetime | ||||
| import string | ||||
| 
 | ||||
| 
 | ||||
# Translation table that deletes every character of ``string.punctuation``.
# Built once at import time: the function below runs once per word.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuations(s):
    """Return *s* with every ``string.punctuation`` character removed.

    Characters outside that ASCII set (letters, digits, accented letters,
    whitespace) are left untouched.
    """
    return s.translate(_PUNCTUATION_TABLE)
| 
 | ||||
| 
 | ||||
class Participant:
    """A chat participant together with the messages they sent."""

    def __init__(self, name):
        """Create a participant with no messages.

        Args:
            name: Display name; it is passed through ``ftfy`` before being
                stored.
        """
        self.name = ftfy.ftfy(name)
        # Maps str(datetime) of the send time -> repaired message text.
        self.messages = {}

    def add_message(self, timestamp, message):
        """Store one message under the string form of its send time.

        Args:
            timestamp: Send time in milliseconds since the epoch.
            message: Message text; it is passed through ``ftfy`` first.

        A message with the same timestamp as an earlier one replaces it,
        because the timestamp string is the dict key.
        """
        self.messages[str(datetime.fromtimestamp(timestamp/1000))] = ftfy.ftfy(message)

    def get_words(self, longer_than=0):
        """Return the upper-cased, punctuation-free words of all messages.

        Args:
            longer_than: Only words with more than this many characters
                (measured after punctuation removal) are returned.

        Returns:
            A list of words in message order, duplicates included.
        """
        words = []

        for message in self.messages.values():
            # split() without arguments splits on any whitespace run, so
            # words separated by newlines or tabs are not glued together.
            for raw in message.split():
                word = remove_punctuations(raw.upper())
                # Measure the length after stripping punctuation so that
                # punctuation-only tokens never yield an empty "word".
                if len(word) > longer_than:
                    words.append(word)

        return words
|  | @ -0,0 +1,19 @@ | |||
| import os | ||||
| import emoji | ||||
| 
 | ||||
| 
 | ||||
def clean_text(d, s):
    """Remove stop words and emoji keys from a word-frequency dict in place.

    Args:
        d: Dict keyed by word; it is modified and also returned.
        s: Iterable of stop words; each is upper-cased before it is looked
            up in ``d``.

    Returns:
        The same dict object ``d`` after the removals.
    """
    for stopword in s:
        d.pop(stopword.upper(), None)

    # Collect the emoji keys first so the dict is not resized while it is
    # being iterated.
    emoji_keys = [key for key in d if emoji.is_emoji(key)]
    for key in emoji_keys:
        del d[key]

    return d
| 
 | ||||
| 
 | ||||
FILE = os.path.dirname(__file__)

# Stop words, one per line, loaded relative to this module. The file is
# opened in a ``with`` block so the handle is closed, and with an explicit
# encoding because the list contains accented letters.
# NOTE(review): assumes the resource file is stored as UTF-8 - confirm.
with open(os.path.join(FILE, '../Resources/stop_words_hungarian.txt'),
          encoding="utf-8") as _stopwords_file:
    HUNGARIAN_STOPWORDS = {line.strip() for line in _stopwords_file}
|  | @ -0,0 +1,7 @@ | |||
| ftfy | ||||
| matplotlib | ||||
| pandas | ||||
| wordcloud | ||||
| numpy | ||||
| plotly | ||||
| kaleido | ||||
		Loading…
	
		Reference in New Issue