From b99f73d09588f1b1cfc335557d02adc21488a42c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1rton=20K=C3=B3nya?=
Date: Sat, 26 Nov 2022 23:55:38 +0100
Subject: [PATCH] some speed

---
 .gitignore                  |  3 ++-
 chatanalyzer/__main__.py    | 20 +++++++++++---------
 chatanalyzer/analyzing.py   | 19 ++++++++++---------
 chatanalyzer/participant.py | 28 +++++++++++++++++++++++++++-
 4 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/.gitignore b/.gitignore
index ad1dd73..e1b90d4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -193,4 +193,5 @@ poetry.toml
 *.json
 *.csv
 *.png
-!mask.png
\ No newline at end of file
+!mask.png
+test.txt
\ No newline at end of file
diff --git a/chatanalyzer/__main__.py b/chatanalyzer/__main__.py
index 5456c57..795acde 100644
--- a/chatanalyzer/__main__.py
+++ b/chatanalyzer/__main__.py
@@ -3,7 +3,7 @@ import json
 import sys
 from datetime import datetime
 
-from participant import Participant
+from participant import Participant, count_all_words
 import ftfy
 import analyzing
 import make_final
@@ -30,24 +30,26 @@ def main():
 
     for m in chat_data["messages"]:
         for p in participants:
-            if m["type"] == "Generic" and "content" in m:
-                if ftfy.ftfy(m["sender_name"]) == p.name:
+            if ftfy.ftfy(m["sender_name"]) == p.name:
+                if m["type"] == "Generic" and "content" in m:
                     p.add_message(m["timestamp_ms"], m["content"])
 
-    CHAT_START = datetime.fromtimestamp(chat_data["messages"][-1]["timestamp_ms"]/1000)
+    # participants[0].print_df()
 
-    WORDS_IN_CHAT = []
+    WORDS_IN_CHAT = count_all_words(participants)
     for p in participants:
-        WORDS_IN_CHAT.extend(p.get_words(longer_than=0))
+        p.count_words()
+    # print(WORDS_IN_CHAT["counts"])
+    # for p in participants:
+    #     WORDS_IN_CHAT.extend(p.get_words(longer_than=0))
 
-    print(f"This chat started: {CHAT_START}\n")
     print("The participants of this chat:")
     for p in participants:
         print(f"{p.name}\n")
 
     analyzing.make_wordcloud(WORDS_IN_CHAT)
-    analyzing.make_timeline(participants)
-    make_final.assemble_image(participants)
+    # analyzing.make_timeline(participants)
+    # make_final.assemble_image(participants)
 
 
 if __name__ == "__main__":
diff --git a/chatanalyzer/analyzing.py b/chatanalyzer/analyzing.py
index 845ee33..9330e4b 100644
--- a/chatanalyzer/analyzing.py
+++ b/chatanalyzer/analyzing.py
@@ -29,18 +29,19 @@ def create_dataframe(participants):
                 "sender": []}
     df = pd.DataFrame(skeleton)
     for p in participants:
-        date=[]
-        date += (p.messages.keys())
-        message=[]
-        message += (p.messages.values())
+        # date=[]
+        # date += (p.messages.keys())
+        # message=[]
+        # message += (p.messages.values())
 
-        m_di = {"date": date,
-                "message": message}
+        # m_di = {"date": date,
+        #         "message": message}
 
-        m_df = pd.DataFrame(m_di)
+        # m_df = pd.DataFrame(m_di)
+        m_df = p.messages_df
 
-        by_month = pd.to_datetime(m_df['date']).dt.to_period('M').value_counts().sort_index()
+        by_month = pd.to_datetime(m_df['timestamp']).dt.to_period('M').value_counts().sort_index()
         by_month.index = pd.PeriodIndex(by_month.index).to_timestamp()
         df_month = by_month.rename_axis('month').reset_index(name='counts')
 
 
@@ -66,7 +67,7 @@ def make_wordcloud(WORDS_IN_CHAT):
                           # min_font_size=2,
                           max_words=600,
                           include_numbers=True
-                          ).generate_from_frequencies(frequencies=clean_text(incidents_of_words(WORDS_IN_CHAT), HUNGARIAN_STOPWORDS))
+                          ).generate_from_frequencies(frequencies=clean_text(WORDS_IN_CHAT, HUNGARIAN_STOPWORDS))
 
     # plot the WordCloud image
     plt.figure(figsize = (10, 10), facecolor = 'k')
diff --git a/chatanalyzer/participant.py b/chatanalyzer/participant.py
index 9c0abeb..21cb5e4 100644
--- a/chatanalyzer/participant.py
+++ b/chatanalyzer/participant.py
@@ -1,11 +1,27 @@
 import ftfy
 from datetime import datetime
 import string
+import pandas as pd
 
 
 def remove_punctuations(s):
     return s.translate(str.maketrans('', '', string.punctuation))
 
 
+def to_uppercase(s):
+    return remove_punctuations(s.upper())
+
+def count_all_words(participants):
+    df = pd.DataFrame()
+    for p in participants:
+        df = pd.concat([df, p.messages_df])
+
+    words = df.set_index(['timestamp']).apply(lambda x: x.str.split(' ').explode()).reset_index()
+    # print(words)
+    words_count = words['message'].value_counts().sort_index()
+    # words_count.index = pd.PeriodIndex(words_count.index)
+    df_out = words_count.rename_axis('message').reset_index(name='counts')
+    df_out['message'] = df_out.apply(lambda row : to_uppercase(row['message']), axis = 1)
+    return df_out.sort_values(by="counts", ascending=True).set_index("message").to_dict()["counts"]
 class Participant:
     def __init__(self, name, title, chat_type):
@@ -13,9 +29,19 @@ class Participant:
         self.messages = {}
         self.title = ftfy.ftfy(title)
         self.chat_type = chat_type
+        self.messages_df = pd.DataFrame({"timestamp": [], "message": []})
+        self.words = pd.DataFrame()
 
     def add_message(self, timestamp, message):
-        self.messages[str(datetime.fromtimestamp(timestamp/1000))] = ftfy.ftfy(message)
+        # self.messages[str(datetime.fromtimestamp(timestamp/1000))] = ftfy.ftfy(message)
+        self.messages_df = self.messages_df.append(dict(zip(self.messages_df.columns,[str(datetime.fromtimestamp(timestamp/1000)), ftfy.ftfy(message)])), ignore_index=True)
+
+    def count_words(self):
+        words = self.messages_df.set_index(['timestamp']).apply(lambda x: x.str.split(' ').explode()).reset_index()
+        # print(words)
+        words_count = words['message'].value_counts().sort_index()
+        # words_count.index = pd.PeriodIndex(words_count.index)
+        self.words = words_count.rename_axis('message').reset_index(name='counts')
 
     def get_words(self, longer_than=0):
         words = []
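
Note on the new word counting: count_all_words() in participant.py and Participant.count_words() share the same pandas pipeline -- split each message on spaces, explode() so every word gets its own row, then value_counts() for the frequencies. The snippet below is a minimal, self-contained sketch of that pipeline; the sample frame and variable names are hypothetical, and only the column names ("timestamp", "message") mirror the patch.

    import pandas as pd

    # Hypothetical sample data shaped like the messages_df built in participant.py.
    messages_df = pd.DataFrame({
        "timestamp": ["2022-11-26 10:00:00", "2022-11-26 10:05:00"],
        "message": ["hello hello world", "world of chat"],
    })

    # One row per word, keeping the timestamp of the message it came from.
    words = (
        messages_df.set_index("timestamp")["message"]
        .str.split(" ")
        .explode()
        .reset_index()
    )

    # Frequency of every word across the whole frame.
    counts = (
        words["message"]
        .value_counts()
        .rename_axis("message")
        .reset_index(name="counts")
    )
    print(counts)

The patch reaches the same result via .apply(lambda x: x.str.split(' ').explode()) over the indexed frame; selecting the single "message" column first, as above, is equivalent when "message" is the only remaining column.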
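
Note on add_message(): the new implementation grows messages_df with DataFrame.append once per message. DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and appending row by row re-copies the frame each time, which works against the commit's stated goal of speed. A possible alternative, sketched below under the assumption that Participant internals may change (the class and method names here are illustrative, not from the patch): buffer rows in a plain list and build the DataFrame once.

    import pandas as pd
    import ftfy
    from datetime import datetime

    class ParticipantSketch:
        """Illustrative stand-in for Participant, not the patch's class."""

        def __init__(self):
            self._rows = []  # plain Python list as a cheap per-message buffer

        def add_message(self, timestamp, message):
            # Same normalisation as the patch: ms timestamp -> string, ftfy-fixed text.
            self._rows.append({
                "timestamp": str(datetime.fromtimestamp(timestamp / 1000)),
                "message": ftfy.ftfy(message),
            })

        def build_messages_df(self):
            # One DataFrame construction for all buffered messages.
            return pd.DataFrame(self._rows, columns=["timestamp", "message"])

The same pattern applies in count_all_words(), where the per-participant pd.concat inside the loop could become a single pd.concat([p.messages_df for p in participants]) call.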