问题

我尝试从这个项目使用此命令

 python create_text_folder.py -i Apache -o output_dir
 

我把下面的 JSON 数据保存在文件 mailcorpus.json 中作为输入:

 [
   {
      "id":12,
      "mailing_list_url":"12",
      "type_of_recipient":"before",
      "email_address":"[email protected]",
      "message_body":"Here is one text to test sentiment and feel happy",
      "is_response_of":"before"
   },
   {
      "id":21,
      "mailing_list_url":"21",
      "type_of_recipient":"before",
      "email_address":"[email protected]",
      "message_body":"Here is one text to test sentiment and feel happy and feel fine",
      "is_response_of":"before"
   },
   {
      "id":21,
      "mailing_list_url":"21",
      "type_of_recipient":"after",
      "email_address":"[email protected]",
      "message_body":"Not feel so good for this code",
      "is_response_of":"after"
   }
]
 

命令的第一部分是这段代码

 import rpy2.robjects as robjects
from bs4 import BeautifulSoup as BS4
from rpy2.robjects.packages import importr
import json
from email_reply_parser import EmailReplyParser

'''
NLoN training
'''
def training_nlon():
    """Load the NLoN R package and build a model from the training data.

    Returns:
        A ``(nlon, model)`` tuple: the imported ``NLoN`` R package handle and
        the object returned by its ``NLoNModel`` function.
    """
    nlon_package = importr('NLoN')

    # Path to NLoN training data
    load_into_r = robjects.r['load']
    load_into_r('data/training_data.rda')

    # `text` and `rater` are looked up in the R session after the load above;
    # presumably they are defined by the .rda file -- TODO confirm.
    training_text = robjects.r['text']
    training_rater = robjects.r['rater']
    model = nlon_package.NLoNModel(training_text, training_rater)

    return nlon_package, model
'''
Gets mail corpus from email addresses
'''
def get_mail_corpus(nlon_cleaning=False, corpus_file='data/mailcorpus.json'):
    """Collect the reply text of the corpus e-mails, grouped by sender address.

    The corpus is a JSON list of records.  Only records whose
    ``type_of_recipient`` field is exactly ``'From'`` are processed; records
    with any other value are skipped, so a corpus without ``'From'`` records
    yields an empty result.

    Args:
        nlon_cleaning: When true, strip HTML markup with BeautifulSoup and
            drop every line that the NLoN model predicts as ``'Not'``.
        corpus_file: Path of the JSON corpus to read.

    Returns:
        A dict mapping each ``email_address`` to the list of non-empty
        message texts found for it, in corpus order.
    """
    if nlon_cleaning:
        nlon, nlon_model = training_nlon()

    with open(corpus_file, encoding='utf-8') as data_file:
        corpus = json.load(data_file)

    total = len(corpus)
    print('Reading and cleaning emails corpus. Number of emails: ' + str(total))
    mails_by_address = {}
    n = 0
    # Text cleaning
    for d in corpus:
        if d['type_of_recipient'] != 'From':
            continue

        # Turn the two-character sequence backslash + "n" into a real line
        # break before the reply parser sees the body.
        res = EmailReplyParser.read(d['message_body'].replace('\\n', '\n'))
        text = res.reply
        n += 1

        if nlon_cleaning:
            try:
                clean_message_body = BS4(text, 'html.parser').text
            except Exception as e:
                # Best effort: fall back to the raw text if HTML parsing fails.
                print('Error with BS4 on text:\n\n%s\n\n' % text, str(e))
                clean_message_body = text.strip()

            # Filter the HTML-cleaned text (not the raw one) line by line,
            # keeping only lines the model does not label as 'Not'.
            kept_lines = [
                line for line in clean_message_body.splitlines()
                if nlon.NLoNPredict(nlon_model, robjects.StrVector([line]))[0] != 'Not'
            ]
            text = '\n'.join(kept_lines)

        if text:
            mails_by_address.setdefault(d['email_address'], []).append(text)

        # Report progress once per 50 processed mails instead of once per
        # corpus record.
        if n % 50 == 0:
            print(str(n) + '/' + str(total))

    print('Mails retrieved: ' + str(n))
    print('Email addresses: ' + str(len(mails_by_address)))
    return mails_by_address
 

这段代码来自一个 GitHub 存储库,该存储库不是我的。

第二部分是以下代码:

 import MailCorpus as mc
import sys, getopt, os
import csv

def main(argv):
    """Write one text file per document of the selected dataset.

    Usage::

        create_text_folder.py -i Apache|LIWC [-nlon] [-p <dataset_path>] -o <output_dir>

    * ``-i Apache`` writes ``<output_dir>/<email_address>.txt`` for every
      address returned by ``mc.get_mail_corpus``.
    * ``-i LIWC`` requires ``-p`` and writes ``<output_dir>/<first column>``
      containing the second column of every CSV row.

    Args:
        argv: Command-line arguments without the program name.

    Exits with status 2 on an option-parsing error or a missing output
    directory.
    """
    usage = 'create_text_folder.py -i Apache|LIWC [-nlon] [-p <dataset_path>] -o <output_dir>'
    input_dataset = ''
    output_dir = ''
    dataset_path = ''
    nlon_cleaning = False

    # getopt reads a single-dash multi-letter flag as a cluster of short
    # options, so "-nlon" used to be parsed as "-n -l -o n" and overwrote the
    # output directory with "n".  Map the documented spellings onto real long
    # options before parsing.
    long_aliases = {'-nlon': '--nlon', '-help': '--help'}
    argv = [long_aliases.get(a, a) for a in argv]
    try:
        opts, args = getopt.getopt(
            argv,
            "hni:o:p:",
            ["help", "nlon", "inputdataset=", "outputdir=", "datasetpath="],
        )
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt in ("-n", "--nlon"):
            # "-n" is kept because the old parser also enabled NLoN for it.
            nlon_cleaning = True
        elif opt in ("-i", "--inputdataset"):
            input_dataset = arg
        elif opt in ("-o", "--outputdir"):
            output_dir = arg
        elif opt in ("-p", "--datasetpath"):
            dataset_path = arg

    print('Dataset: ' + str(input_dataset))
    print('NLoN: ' + str(nlon_cleaning))
    print('Dataset path: ' + dataset_path)
    print('Output directory: ' + output_dir)

    is_apache = input_dataset == 'Apache'
    is_liwc = input_dataset == 'LIWC' and dataset_path != ''
    if not (is_apache or is_liwc):
        print('Wrong input dataset')
        print(usage)
        sys.exit()

    if not output_dir:
        # os.makedirs('') would fail with a traceback; report usage instead.
        print(usage)
        sys.exit(2)

    texts = {}
    if is_apache:
        suffix = '.txt'
        corpus = mc.get_mail_corpus(nlon_cleaning)
        for address, messages in corpus.items():
            texts[address] = '\n'.join(messages)
    else:
        suffix = ''
        # Path to liwc gold standard
        # header is 'ID,text,cEXT,cNEU,cAGR,cCON,cOPN'
        with open(dataset_path, encoding='cp1252', newline='') as csv_file:
            for row in csv.reader(csv_file, delimiter=','):
                if len(row) < 2:
                    # Blank or truncated row: there is no text to write.
                    continue
                texts[row[0]] = row[1]

    os.makedirs(output_dir, exist_ok=True)
    for key, text in texts.items():
        with open(output_dir + '/' + str(key) + suffix, "w") as text_file:
            print(text, file=text_file)


# Run the command-line entry point only when this file is executed as a
# script, passing every argument after the program name.
if __name__ == "__main__":
   main(sys.argv[1:])
 

此代码再次来自github存储库

它似乎可以运行,但命令的输出如下:

 $ python create_text_folder.py -i Apache -o output_dir
Dataset: Apache
NLoN: False
Dataset path:
Output directory: output_dir
Reading and cleaning emails corpus. Number of emails: 3
0/3
0/3
0/3
Mails retrieved: 0
Email addresses: 0
 

我认为是 JSON 格式有问题:程序识别出了 3 个对象,但没有处理其中任何一个,output_dir 中也没有生成任何文件。

我能做什么?

  最佳答案

我看到你的json中有小错误,并纠正它们,这是你可以使用的结果:

[
   {
      "id":12,
      "mailing_list_url":"12",
      "type_of_recipient":"before",
      "email_address":"[email protected]",
      "message_body":"Here is one text to test sentiment and feel happy",
      "is_response_of":"before"
   },
   {
      "id":21,
      "mailing_list_url":"21",
      "type_of_recipient":"before",
      "email_address":"[email protected]",
      "message_body":"Here is one text to test sentiment and feel happy and feel fine",
      "is_response_of":"before"
   },
   {
      "id":21,
      "mailing_list_url":"21",
      "type_of_recipient":"after",
      "email_address":"[email protected]",
      "message_body":"Not feel so good for this code",
      "is_response_of":"after"
   }
]

如果您的代码仍然无法完成您想要的任务,请在下面留言,我会再帮您检查代码。祝好!

  相同标签的其他问题

python