J’ai voulu écrire un premier programme afin de découvrir l’API Elasticsearch. Comme base d’information, j’ai pris mes emails : c’est assez simple, toutes les personnes sous macOS ont des emails …
Voici donc le petit programme en Python (pour Michel) : il suffit de remplacer MonUser par votre nom d’utilisateur dans la variable path_mail.
#!/usr/bin/env python3
import email
import glob, os
import json
import plistlib
import re
from datetime import datetime
from email.header import Header, decode_header, make_header
from email.utils import parsedate_to_datetime

from elasticsearch import Elasticsearch
class Emlx(object):
    """Reader for Apple Mail ``.emlx`` files.

    The layout read by :meth:`parse` is: a first line holding the byte
    length of the message, then that many bytes of raw message data, then
    a property list occupying the rest of the file.

    Attributes:
        bytecount: Message length announced on the first line of the last
            parsed file (0 before any call to :meth:`parse`).
        msg_data: Message object built from the last parsed file, or None.
        msg_plist: Decoded property list of the last parsed file, or None.
    """

    def __init__(self):
        super().__init__()
        self.bytecount = 0
        self.msg_data = None
        self.msg_plist = None

    def parse(self, filename_path):
        """Parse one ``.emlx`` file and return its two parts.

        Args:
            filename_path: Path of the file to read.

        Returns:
            A ``(msg_data, msg_plist)`` tuple: the message produced by
            ``email.message_from_bytes`` and the object produced by
            ``plistlib.loads``. Both are also stored on the instance.

        Raises:
            OSError: The file cannot be opened or read.
            ValueError: The first line is not an integer.
            Exception: Whatever ``plistlib.loads`` raises when the trailing
                data is not a valid property list.
        """
        with open(filename_path, "rb") as f:
            # The first line announces how many bytes the message occupies.
            self.bytecount = int(f.readline().strip())
            self.msg_data = email.message_from_bytes(f.read(self.bytecount))
            # Everything after the message is the property list.
            self.msg_plist = plistlib.loads(f.read())
        return self.msg_data, self.msg_plist
# Patterns are compiled once because they are applied to every message.
_DOMAIN_RE = re.compile(r"@[\w.\-]+")
_LOCAL_PART_RE = re.compile(r"[\w.\-]+@")
_IPV4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")


def _sender_fields(raw_from):
    """Return the ``(name, domain)`` values derived from a ``From`` header.

    When both patterns match, ``name`` keeps its trailing ``@`` and
    ``domain`` its leading ``@`` (the matched text is used as is).
    Otherwise ``name`` is the whole header text with commas and double
    quotes removed and ``domain`` is the string ``"None"``.
    """
    sender = str(raw_from)
    domain = _DOMAIN_RE.search(sender)
    local_part = _LOCAL_PART_RE.search(sender)
    # Both parts are required: an address such as "<@example.com>" has a
    # domain but no local part, and must not crash on ``None.group()``.
    if domain is not None and local_part is not None:
        return local_part.group(), domain.group()
    return sender.replace(",", "").replace('"', ''), "None"


def _format_date(raw_date):
    """Return a ``Date`` header as ``YYYY-MM-DDTHH:MM:SS``, or None.

    The timestamp is rendered through ``datetime.fromtimestamp``, i.e. in
    the local time zone of the machine. None is returned when the header
    cannot be parsed, so one malformed message does not abort the run.
    """
    try:
        # str() also accepts email.header.Header instances.
        parsed = parsedate_to_datetime(str(raw_date))
        local = datetime.fromtimestamp(parsed.timestamp())
    except (TypeError, ValueError, OverflowError, OSError):
        return None
    return local.strftime('%Y-%m-%dT%H:%M:%S')


def build_document(message, file_name, size, doc_id):
    """Build the document indexed for one parsed email.

    Args:
        message: Object whose ``message['Header-Name']`` lookup returns a
            string, an ``email.header.Header`` or None.
        file_name: Name of the ``.emlx`` file, stored under ``file``.
        size: Value stored under ``size``.
        doc_id: Value stored under ``id``.

    Returns:
        A dict with the keys ``name``, ``domain``, ``size``, ``id`` and
        ``file``, plus ``date``, ``ip`` and ``Message-ID`` when they can be
        extracted from the ``Date``, ``Received`` and ``Message-ID`` headers.
    """
    name, domain = _sender_fields(message['From'])
    document = {"name": name, "domain": domain}

    raw_date = message['Date']
    if raw_date is not None:
        date_str = _format_date(raw_date)
        if date_str is not None:
            document["date"] = date_str
    document["size"] = size
    document["id"] = doc_id

    raw_received = message['Received']
    if raw_received is not None:
        # First dotted quad found in the header returned for 'Received'.
        ip = _IPV4_RE.search(str(raw_received))
        if ip is not None:
            document["ip"] = ip.group()

    raw_id = message['Message-ID']
    if raw_id is not None:
        document["Message-ID"] = str(raw_id).strip()
    document["file"] = file_name
    return document


if __name__ == '__main__':
    msg = Emlx()
    nb_parse = 0
    path_mail = "/Users/MonUser/Library/Mail/V6/"
    es_keys = "mail"
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    for root, dirs, files in os.walk(path_mail):
        for file in files:
            if not file.endswith(".emlx"):
                continue
            file_full = os.path.join(root, file)
            message, plist = msg.parse(file_full)
            statinfo = os.stat(file_full)
            document = build_document(message, file, statinfo.st_size, nb_parse)
            # json.dumps escapes quotes, backslashes and control characters,
            # which hand-built string concatenation did not.
            payload = json.dumps(document, separators=(",", ":"))
            print(payload)
            es.index(index=es_keys, doc_type='emlx', id=nb_parse, body=payload)
            nb_parse += 1
    # Number of messages sent to Elasticsearch.
    print(nb_parse)
Le but de ce programme c’est simplement de mieux comprendre l’API. Pour le lancer j’ai fait :
sudo python3 ParseEmail.py > email-json.txt
À noter que le Terminal doit avoir certains droits pour que cela fonctionne : https://www.cyber-neurones.org/2019/11/macos-acces-a-library-mail-via-un-terminal/ .
Ensuite, pour faire un petit contrôle, il suffit d’ouvrir : http://localhost:9200/mail/_mappings .
{"mail":{"mappings":{"emlx":{"properties":{"Message-ID":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},"date":{"type":"date"},"domain":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},"file":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},"id":{"type":"long"},"ip":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},"name":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},"size":{"type":"long"}}}}}}
Je viens de lancer le programme … c’est très long, voici ce qu’il y a pour l’instant en base (sur les 20 dernières années) :

