From b99f73d09588f1b1cfc335557d02adc21488a42c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1rton=20K=C3=B3nya?=
Date: Sat, 26 Nov 2022 23:55:38 +0100
Subject: [PATCH] some speed

---
 .gitignore                  |  3 ++-
 chatanalyzer/__main__.py    | 20 +++++++++++---------
 chatanalyzer/analyzing.py   | 19 ++++++++++---------
 chatanalyzer/participant.py | 28 +++++++++++++++++++++++++++-
 4 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/.gitignore b/.gitignore
index ad1dd73..e1b90d4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -193,4 +193,5 @@ poetry.toml
 *.json
 *.csv
 *.png
-!mask.png
\ No newline at end of file
+!mask.png
+test.txt
\ No newline at end of file
diff --git a/chatanalyzer/__main__.py b/chatanalyzer/__main__.py
index 5456c57..795acde 100644
--- a/chatanalyzer/__main__.py
+++ b/chatanalyzer/__main__.py
@@ -3,7 +3,7 @@ import json
 import sys
 from datetime import datetime
 
-from participant import Participant
+from participant import Participant, count_all_words
 import ftfy
 import analyzing
 import make_final
@@ -30,24 +30,26 @@ def main():
 
     for m in chat_data["messages"]:
         for p in participants:
-            if m["type"] == "Generic" and "content" in m:
-                if ftfy.ftfy(m["sender_name"]) == p.name:
+            if ftfy.ftfy(m["sender_name"]) == p.name:
+                if m["type"] == "Generic" and "content" in m:
                     p.add_message(m["timestamp_ms"], m["content"])
 
-    CHAT_START = datetime.fromtimestamp(chat_data["messages"][-1]["timestamp_ms"]/1000)
+    # participants[0].print_df()
 
-    WORDS_IN_CHAT = []
+    WORDS_IN_CHAT = count_all_words(participants)
     for p in participants:
-        WORDS_IN_CHAT.extend(p.get_words(longer_than=0))
+        p.count_words()
+    # print(WORDS_IN_CHAT["counts"])
+    # for p in participants:
+    #     WORDS_IN_CHAT.extend(p.get_words(longer_than=0))
 
-    print(f"This chat started: {CHAT_START}\n")
     print("The participants of this chat:")
     for p in participants:
         print(f"{p.name}\n")
 
     analyzing.make_wordcloud(WORDS_IN_CHAT)
-    analyzing.make_timeline(participants)
-    make_final.assemble_image(participants)
+    # analyzing.make_timeline(participants)
+    # make_final.assemble_image(participants)
 
 
 if __name__ == "__main__":
diff --git a/chatanalyzer/analyzing.py b/chatanalyzer/analyzing.py
index 845ee33..9330e4b 100644
--- a/chatanalyzer/analyzing.py
+++ b/chatanalyzer/analyzing.py
@@ -29,18 +29,19 @@ def create_dataframe(participants):
                 "sender": []}
     df = pd.DataFrame(skeleton)
     for p in participants:
-        date=[]
-        date += (p.messages.keys())
-        message=[]
-        message += (p.messages.values())
+        # date=[]
+        # date += (p.messages.keys())
+        # message=[]
+        # message += (p.messages.values())
 
-        m_di = {"date": date,
-                "message": message}
+        # m_di = {"date": date,
+        #         "message": message}
 
-        m_df = pd.DataFrame(m_di)
+        # m_df = pd.DataFrame(m_di)
+        m_df = p.messages_df
 
-        by_month = pd.to_datetime(m_df['date']).dt.to_period('M').value_counts().sort_index()
+        by_month = pd.to_datetime(m_df['timestamp']).dt.to_period('M').value_counts().sort_index()
         by_month.index = pd.PeriodIndex(by_month.index).to_timestamp()
         df_month = by_month.rename_axis('month').reset_index(name='counts')
 
 
@@ -66,7 +67,7 @@ def make_wordcloud(WORDS_IN_CHAT):
                           # min_font_size=2,
                           max_words=600,
                           include_numbers=True
-                          ).generate_from_frequencies(frequencies=clean_text(incidents_of_words(WORDS_IN_CHAT), HUNGARIAN_STOPWORDS))
+                          ).generate_from_frequencies(frequencies=clean_text(WORDS_IN_CHAT, HUNGARIAN_STOPWORDS))
 
     # plot the WordCloud image
     plt.figure(figsize = (10, 10), facecolor = 'k')
diff --git a/chatanalyzer/participant.py b/chatanalyzer/participant.py
index 9c0abeb..21cb5e4 100644
--- a/chatanalyzer/participant.py
+++ b/chatanalyzer/participant.py
@@ -1,11 +1,27 @@
 import ftfy
 from datetime import datetime
 import string
+import pandas as pd
 
 
 def remove_punctuations(s):
     return s.translate(str.maketrans('', '', string.punctuation))
 
 
+def to_uppercase(s):
+    return remove_punctuations(s.upper())
+
+def count_all_words(participants):
+    df = pd.DataFrame()
+    for p in participants:
+        df = pd.concat([df, p.messages_df])
+
+    words = df.set_index(['timestamp']).apply(lambda x: x.str.split(' ').explode()).reset_index()
+    # print(words)
+    words_count = words['message'].value_counts().sort_index()
+    # words_count.index = pd.PeriodIndex(words_count.index)
+    df_out = words_count.rename_axis('message').reset_index(name='counts')
+    df_out['message'] = df_out.apply(lambda row : to_uppercase(row['message']), axis = 1)
+    return df_out.sort_values(by="counts", ascending=True).set_index("message").to_dict()["counts"]
 class Participant:
     def __init__(self, name, title, chat_type):
@@ -13,9 +29,19 @@ class Participant:
         self.messages = {}
         self.title = ftfy.ftfy(title)
         self.chat_type = chat_type
+        self.messages_df = pd.DataFrame({"timestamp": [], "message": []})
+        self.words = pd.DataFrame()
 
     def add_message(self, timestamp, message):
-        self.messages[str(datetime.fromtimestamp(timestamp/1000))] = ftfy.ftfy(message)
+        # self.messages[str(datetime.fromtimestamp(timestamp/1000))] = ftfy.ftfy(message)
+        self.messages_df = self.messages_df.append(dict(zip(self.messages_df.columns,[str(datetime.fromtimestamp(timestamp/1000)), ftfy.ftfy(message)])), ignore_index=True)
+
+    def count_words(self):
+        words = self.messages_df.set_index(['timestamp']).apply(lambda x: x.str.split(' ').explode()).reset_index()
+        # print(words)
+        words_count = words['message'].value_counts().sort_index()
+        # words_count.index = pd.PeriodIndex(words_count.index)
+        self.words = words_count.rename_axis('message').reset_index(name='counts')
 
     def get_words(self, longer_than=0):
         words = []
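
Note on the new word counting: count_all_words() in participant.py and Participant.count_words() share the same pandas pipeline -- split each message on spaces, explode() so every word gets its own row, then value_counts() for the frequencies. The snippet below is a minimal, self-contained sketch of that pipeline; the sample frame and variable names are hypothetical, and only the column names ("timestamp", "message") mirror the patch.

    import pandas as pd

    # Hypothetical sample data shaped like the messages_df built in participant.py.
    messages_df = pd.DataFrame({
        "timestamp": ["2022-11-26 10:00:00", "2022-11-26 10:05:00"],
        "message": ["hello hello world", "world of chat"],
    })

    # One row per word, keeping the timestamp of the message it came from.
    words = (
        messages_df.set_index("timestamp")["message"]
        .str.split(" ")
        .explode()
        .reset_index()
    )

    # Frequency of every word across the whole frame.
    counts = (
        words["message"]
        .value_counts()
        .rename_axis("message")
        .reset_index(name="counts")
    )
    print(counts)

The patch reaches the same result via .apply(lambda x: x.str.split(' ').explode()) over the indexed frame; selecting the single "message" column first, as above, is equivalent when "message" is the only remaining column.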
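
Note on add_message(): the new implementation grows messages_df with DataFrame.append once per message. DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and appending row by row re-copies the frame each time, which works against the commit's stated goal of speed. A possible alternative, sketched below under the assumption that Participant internals may change (the class and method names here are illustrative, not from the patch): buffer rows in a plain list and build the DataFrame once.

    import pandas as pd
    import ftfy
    from datetime import datetime

    class ParticipantSketch:
        """Illustrative stand-in for Participant, not the patch's class."""

        def __init__(self):
            self._rows = []  # plain Python list as a cheap per-message buffer

        def add_message(self, timestamp, message):
            # Same normalisation as the patch: ms timestamp -> string, ftfy-fixed text.
            self._rows.append({
                "timestamp": str(datetime.fromtimestamp(timestamp / 1000)),
                "message": ftfy.ftfy(message),
            })

        def build_messages_df(self):
            # One DataFrame construction for all buffered messages.
            return pd.DataFrame(self._rows, columns=["timestamp", "message"])

The same pattern applies in count_all_words(), where the per-participant pd.concat inside the loop could become a single pd.concat([p.messages_df for p in participants]) call.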