From 1dfb6672fa3e7788f6e42a0eb17e9082f41aaf71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20K=C3=B3nya?= Date: Mon, 28 Nov 2022 21:36:51 +0100 Subject: [PATCH] barchart --- chatanalyzer/__main__.py | 12 ++++++--- chatanalyzer/analyzing.py | 53 +++++++++++++++++++++++++++++++++++++ chatanalyzer/make_final.py | 19 +++++++------ chatanalyzer/participant.py | 14 +++++++--- 4 files changed, 83 insertions(+), 15 deletions(-) diff --git a/chatanalyzer/__main__.py b/chatanalyzer/__main__.py index a2def92..eb51dec 100644 --- a/chatanalyzer/__main__.py +++ b/chatanalyzer/__main__.py @@ -14,9 +14,9 @@ def read_json(filename): return json.load(f) -def main(): +def main(arguments=list(sys.argv[1:])): participants = [] - file_list = list(sys.argv[1:]) + file_list = arguments chat_data = read_json(file_list[0]) @@ -36,15 +36,19 @@ def main(): WORDS_IN_CHAT = count_all_words(participants) for p in participants: + print(p.name) p.count_words() print("The participants of this chat:") for p in participants: print(f"{p.name}\n") + + # print(participants[0].message_count) analyzing.make_wordcloud(WORDS_IN_CHAT) - # analyzing.make_timeline(participants) - # make_final.assemble_image(participants) + analyzing.make_timeline(participants) + analyzing.make_barchart(participants, WORDS_IN_CHAT) + make_final.assemble_image(participants) if __name__ == "__main__": diff --git a/chatanalyzer/analyzing.py b/chatanalyzer/analyzing.py index 01c366d..ad6dca7 100644 --- a/chatanalyzer/analyzing.py +++ b/chatanalyzer/analyzing.py @@ -23,6 +23,11 @@ def incidents_of_words(words): return d +def set_sender(s, p): + s = p.name + return s + + def create_dataframe(participants): skeleton = {"month": [], "counts": [], @@ -90,3 +95,51 @@ def make_timeline(participants): }) fig.write_image(f"{RESOURCE_LOCATION}/timeline.png",format="png", width=1500, height=600, scale=3) + + +def make_barchart(participants, WORDS_IN_CHAT): + words = {} + df = pd.DataFrame({"sender": [], "message": [], "counts": []}) + + for key in list(clean_text(WORDS_IN_CHAT, HUNGARIAN_STOPWORDS).keys())[:50]: + # print(key, WORDS_IN_CHAT[key]) + words[key] = WORDS_IN_CHAT[key] + + for p in participants: + df_c = pd.DataFrame({"message": [], "counts": [], "sender": []}) + df_p = p.words.loc[p.words['message'].isin(list(words.keys()))] + + words_count = df_p['message'].value_counts().sort_index() + df_c = words_count.rename_axis('message').reset_index(name='counts') + # print(df_c) + df_c['sender'] = p.name#df_c.apply(lambda row : set_sender(row['sender'], p), axis = 1) + + df = pd.concat([df, df_c], ignore_index=True) + # print(df_c) + # sender=[] + # sender += [p.name]*df.count() + # df["sender"] = sender + + fig = px.bar(df, x="counts", y="message", color="sender", orientation='h', height=5000) + fig.update_layout({ "showlegend": False, + "bargap": 0.5, + "title": {"text": "MOST USED WORDS", + "x": 0.5, + "font": {"color": "white"}}, + "xaxis": {"showgrid": False, + "title": "", + "nticks": 5, + "color": "white"}, + "yaxis": {"color": "white", + "nticks": 50, + "categoryorder": "sum ascending", + "title": {"text": ""}, + "color": "white"}, + "paper_bgcolor": 'rgba(255,255,255,0)', + "plot_bgcolor": 'rgba(255,255,255,0)' + }) + + fig.write_image(f"{RESOURCE_LOCATION}/barchart.png",format="png", width=500, height=1200, scale=3) + + # df = pd.DataFrame({"message": list(words.keys())}) + # print(df) diff --git a/chatanalyzer/make_final.py b/chatanalyzer/make_final.py index 1c11164..150ff67 100644 --- a/chatanalyzer/make_final.py +++ b/chatanalyzer/make_final.py @@ -28,7 +28,7 @@ def generate_images_from_text(x, y, txt): return img -def assemble_image(participants, wordcloudimg="wordcloud.png", timelineimg="timeline.png"): +def assemble_image(participants, wordcloudimg="wordcloud.png", timelineimg="timeline.png", barchartimg="barchart.png"): if participants[0].chat_type == "Regular": names = [] for p in participants: @@ -42,21 +42,24 @@ def assemble_image(participants, wordcloudimg="wordcloud.png", timelineimg="time number_of_messages = 0 for p in participants: - number_of_messages += len(p.messages) + number_of_messages += p.message_count n_o_m = generate_images_from_text(750, 350, str(number_of_messages)) # n_o_m.save(f'{RESOURCE_LOCATION}/number_of_messages.png') n_o_m_text = generate_images_from_text(750, 400, "TOTAL NUMBER OF MESSAGES") # n_o_m_text.save(f'{RESOURCE_LOCATION}/number_of_messages_text.png') - final = Image.new('RGB', (5000, 8000), (0, 0, 0)) - final.paste(names, (0, 0)) - final.paste(n_o_m_text, (3250, 4400)) - final.paste(n_o_m, (3250, 4750)) + final = Image.new('RGBA', (5000, 8000), (35, 35, 35, 255)) + final.paste(names, (0, 0),mask=names) + final.paste(n_o_m_text, (3250, 4400), mask=n_o_m_text) + final.paste(n_o_m, (3250, 4750), mask=n_o_m) wordcloud = Image.open(f"{RESOURCE_LOCATION}/{wordcloudimg}") - final.paste(wordcloud, (1500, 600)) + # alpha = wordcloud.convert('RGBA').split()[-1] + final.paste(wordcloud, (1500, 600), mask=wordcloud) timeline = Image.open(f"{RESOURCE_LOCATION}/{timelineimg}") - final.paste(timeline, (250, 2600)) + final.paste(timeline, (250, 2600), mask=timeline) + barchart = Image.open(f"{RESOURCE_LOCATION}/{barchartimg}") + final.paste(barchart, (1750, 4400), mask=barchart) final.save(f"{RESOURCE_LOCATION}/final.png") diff --git a/chatanalyzer/participant.py b/chatanalyzer/participant.py index a3c9aa3..d041a14 100644 --- a/chatanalyzer/participant.py +++ b/chatanalyzer/participant.py @@ -19,7 +19,12 @@ def count_all_words(participants): words_count = words['message'].value_counts().sort_index() df_out = words_count.rename_axis('message').reset_index(name='counts') df_out['message'] = df_out.apply(lambda row : to_uppercase(row['message']), axis = 1) - return df_out.sort_values(by="counts", ascending=True).set_index("message").to_dict()["counts"] + # print(df_out.sort_values(by="counts", ascending=True).set_index("message")) + d = df_out.sort_values(by="counts", ascending=True).set_index("message").to_dict()["counts"] + marklist = sorted(d.items(), key=lambda x:x[1], reverse=True) + sortdict = dict(marklist) + # print(sortdict) + return sortdict class Participant: def __init__(self, name, title, chat_type): @@ -29,14 +34,17 @@ class Participant: self.chat_type = chat_type self.messages_df = pd.DataFrame({"timestamp": [], "message": []}) self.words = pd.DataFrame() + self.message_count = 0 def add_message(self, timestamp, message): self.messages_df = self.messages_df.append(dict(zip(self.messages_df.columns,[str(datetime.fromtimestamp(timestamp/1000)), ftfy.ftfy(message)])), ignore_index=True) def count_words(self): words = self.messages_df.set_index(['timestamp']).apply(lambda x: x.str.split(' ').explode()).reset_index() - words_count = words['message'].value_counts().sort_index() - self.words = words_count.rename_axis('message').reset_index(name='counts') + # words_count = words['message'].value_counts().sort_index() + words['message'] = words.apply(lambda row : to_uppercase(row['message']), axis = 1) + self.words = words.rename_axis('message')#.reset_index(name='counts') + self.message_count = self.messages_df.count()[0] def get_words(self, longer_than=0): words = []