some speed
This commit is contained in:
parent
60fda1c957
commit
b99f73d095
|
|
@ -193,4 +193,5 @@ poetry.toml
|
||||||
*.json
|
*.json
|
||||||
*.csv
|
*.csv
|
||||||
*.png
|
*.png
|
||||||
!mask.png
|
!mask.png
|
||||||
|
test.txt
|
||||||
|
|
@ -3,7 +3,7 @@
|
||||||
import json
|
import json
|
||||||
import sys
|
import sys
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from participant import Participant
|
from participant import Participant, count_all_words
|
||||||
import ftfy
|
import ftfy
|
||||||
import analyzing
|
import analyzing
|
||||||
import make_final
|
import make_final
|
||||||
|
|
@ -30,24 +30,26 @@ def main():
|
||||||
|
|
||||||
for m in chat_data["messages"]:
|
for m in chat_data["messages"]:
|
||||||
for p in participants:
|
for p in participants:
|
||||||
if m["type"] == "Generic" and "content" in m:
|
if ftfy.ftfy(m["sender_name"]) == p.name:
|
||||||
if ftfy.ftfy(m["sender_name"]) == p.name:
|
if m["type"] == "Generic" and "content" in m:
|
||||||
p.add_message(m["timestamp_ms"], m["content"])
|
p.add_message(m["timestamp_ms"], m["content"])
|
||||||
|
|
||||||
CHAT_START = datetime.fromtimestamp(chat_data["messages"][-1]["timestamp_ms"]/1000)
|
# participants[0].print_df()
|
||||||
|
|
||||||
WORDS_IN_CHAT = []
|
WORDS_IN_CHAT = count_all_words(participants)
|
||||||
for p in participants:
|
for p in participants:
|
||||||
WORDS_IN_CHAT.extend(p.get_words(longer_than=0))
|
p.count_words()
|
||||||
|
# print(WORDS_IN_CHAT["counts"])
|
||||||
|
# for p in participants:
|
||||||
|
# WORDS_IN_CHAT.extend(p.get_words(longer_than=0))
|
||||||
|
|
||||||
print(f"This chat started: {CHAT_START}\n")
|
|
||||||
print("The participants of this chat:")
|
print("The participants of this chat:")
|
||||||
for p in participants:
|
for p in participants:
|
||||||
print(f"{p.name}\n")
|
print(f"{p.name}\n")
|
||||||
|
|
||||||
analyzing.make_wordcloud(WORDS_IN_CHAT)
|
analyzing.make_wordcloud(WORDS_IN_CHAT)
|
||||||
analyzing.make_timeline(participants)
|
# analyzing.make_timeline(participants)
|
||||||
make_final.assemble_image(participants)
|
# make_final.assemble_image(participants)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
|
|
@ -29,18 +29,19 @@ def create_dataframe(participants):
|
||||||
"sender": []}
|
"sender": []}
|
||||||
df = pd.DataFrame(skeleton)
|
df = pd.DataFrame(skeleton)
|
||||||
for p in participants:
|
for p in participants:
|
||||||
date=[]
|
# date=[]
|
||||||
date += (p.messages.keys())
|
# date += (p.messages.keys())
|
||||||
message=[]
|
# message=[]
|
||||||
message += (p.messages.values())
|
# message += (p.messages.values())
|
||||||
|
|
||||||
m_di = {"date": date,
|
# m_di = {"date": date,
|
||||||
"message": message}
|
# "message": message}
|
||||||
|
|
||||||
m_df = pd.DataFrame(m_di)
|
# m_df = pd.DataFrame(m_di)
|
||||||
|
m_df = p.messages_df
|
||||||
|
|
||||||
|
|
||||||
by_month = pd.to_datetime(m_df['date']).dt.to_period('M').value_counts().sort_index()
|
by_month = pd.to_datetime(m_df['timestamp']).dt.to_period('M').value_counts().sort_index()
|
||||||
by_month.index = pd.PeriodIndex(by_month.index).to_timestamp()
|
by_month.index = pd.PeriodIndex(by_month.index).to_timestamp()
|
||||||
df_month = by_month.rename_axis('month').reset_index(name='counts')
|
df_month = by_month.rename_axis('month').reset_index(name='counts')
|
||||||
|
|
||||||
|
|
@ -66,7 +67,7 @@ def make_wordcloud(WORDS_IN_CHAT):
|
||||||
# min_font_size=2,
|
# min_font_size=2,
|
||||||
max_words=600,
|
max_words=600,
|
||||||
include_numbers=True
|
include_numbers=True
|
||||||
).generate_from_frequencies(frequencies=clean_text(incidents_of_words(WORDS_IN_CHAT), HUNGARIAN_STOPWORDS))
|
).generate_from_frequencies(frequencies=clean_text(WORDS_IN_CHAT, HUNGARIAN_STOPWORDS))
|
||||||
|
|
||||||
# plot the WordCloud image
|
# plot the WordCloud image
|
||||||
plt.figure(figsize = (10, 10), facecolor = 'k')
|
plt.figure(figsize = (10, 10), facecolor = 'k')
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,27 @@
|
||||||
import ftfy
|
import ftfy
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import string
|
import string
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
# Built once at import time: the mapping that deletes ASCII punctuation is the
# same for every call, and this helper is invoked once per counted word.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuations(s):
    """Return ``s`` with every character of ``string.punctuation`` removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is deleted;
    any other character (letters, digits, whitespace, non-ASCII symbols) is
    kept unchanged.

    Args:
        s: the string to clean.

    Returns:
        A new string without ASCII punctuation characters.
    """
    return s.translate(_PUNCTUATION_TABLE)
|
||||||
|
|
||||||
|
def to_uppercase(s):
    """Upper-case ``s`` and strip punctuation from the result.

    Despite the name this does two things: the string is upper-cased first
    and then handed to ``remove_punctuations``.

    Args:
        s: the string to normalise.

    Returns:
        The upper-cased string with punctuation removed.
    """
    shouted = s.upper()
    return remove_punctuations(shouted)
|
||||||
|
|
||||||
|
def count_all_words(participants):
    """Count how often each normalised word occurs across all participants.

    Each message is split on single spaces; every token is upper-cased and
    stripped of ASCII punctuation (the same normalisation ``to_uppercase``
    applies) *before* counting. Counting after normalising matters: variants
    such as ``"Hello,"`` and ``"hello"`` add up under one key instead of
    overwriting each other when the result is turned into a dict.

    Tokens that consist only of punctuation (or come from consecutive spaces)
    end up under the empty-string key ``""``.

    Args:
        participants: iterable of objects exposing a ``messages_df``
            DataFrame with a ``"message"`` column of strings.

    Returns:
        dict mapping normalised word -> total number of occurrences, ordered
        by ascending count (alphabetical among equal counts). Empty when no
        participant has any message.
    """
    # Skip participants without messages so empty frames cannot influence the
    # dtype of the combined column.
    frames = [p.messages_df for p in participants if not p.messages_df.empty]
    if not frames:
        return {}

    # A single concat over the list; concatenating inside the loop re-copies
    # every previously collected row on each iteration.
    messages = pd.concat(frames, ignore_index=True)["message"].dropna()
    if messages.empty:
        return {}

    strip_punctuation = str.maketrans('', '', string.punctuation)
    words = (
        messages.str.split(' ')
        .explode()
        .str.upper()
        .str.translate(strip_punctuation)
    )

    # sort_index + stable sort gives a deterministic order for equal counts.
    counts = words.value_counts().sort_index().sort_values(kind="mergesort")
    return counts.to_dict()
|
||||||
|
|
||||||
class Participant:
|
class Participant:
|
||||||
def __init__(self, name, title, chat_type):
|
def __init__(self, name, title, chat_type):
|
||||||
|
|
@ -13,9 +29,19 @@ class Participant:
|
||||||
self.messages = {}
|
self.messages = {}
|
||||||
self.title = ftfy.ftfy(title)
|
self.title = ftfy.ftfy(title)
|
||||||
self.chat_type = chat_type
|
self.chat_type = chat_type
|
||||||
|
self.messages_df = pd.DataFrame({"timestamp": [], "message": []})
|
||||||
|
self.words = pd.DataFrame()
|
||||||
|
|
||||||
def add_message(self, timestamp, message):
    """Append one message as a new row of ``self.messages_df``.

    Args:
        timestamp: time of the message in milliseconds since the epoch; it
            is converted with ``datetime.fromtimestamp`` (local time) and
            stored as a string.
        message: raw message text; it is passed through ``ftfy.ftfy``
            before being stored.
    """
    sent_at = str(datetime.fromtimestamp(timestamp / 1000))
    # DataFrame.append was removed in pandas 2.0. Assigning to the label
    # len(df) adds the row at the next position, which is where
    # append(..., ignore_index=True) used to put it. Values are given in
    # column order: timestamp, message.
    self.messages_df.loc[len(self.messages_df)] = [sent_at, ftfy.ftfy(message)]
|
||||||
|
|
||||||
|
def count_words(self):
    """Compute this participant's word frequencies into ``self.words``.

    Every message in ``self.messages_df`` is split on single spaces and the
    resulting tokens are counted as they are (no case folding, no
    punctuation stripping).

    After the call ``self.words`` is a DataFrame with the columns
    ``"message"`` (the token) and ``"counts"`` (its number of occurrences),
    sorted by token. A participant without messages gets an empty frame
    with those two columns instead of raising.
    """
    messages = self.messages_df["message"]
    if messages.empty:
        # The .str accessor is only available on string-like columns; an
        # empty column may not have such a dtype, so short-circuit here.
        self.words = pd.DataFrame({"message": [], "counts": []})
        return

    # Working on the column directly avoids the set_index/apply/reset_index
    # round trip; the timestamp is not needed for counting.
    tokens = messages.str.split(' ').explode()
    tally = tokens.value_counts().sort_index()
    self.words = tally.rename_axis('message').reset_index(name='counts')
|
||||||
|
|
||||||
def get_words(self, longer_than=0):
|
def get_words(self, longer_than=0):
|
||||||
words = []
|
words = []
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue