`make_data.py`:

```python
import json
import random

import nltk
import pandas as pd
from tqdm import tqdm

# Unicode-confusables table: each row maps a prototype code point to a
# visually similar glyph.
confusable_csv = "./data/confusable.csv"
conf_df = pd.read_csv(confusable_csv, names=["id", "control", "glyphs", "code point", "description", "prototype"])


def random_glyphs(ch, conf_df):
    """Return a random homoglyph for `ch`, or False if none is available."""
    ch = '%04x' % ord(ch)
    candi = conf_df.loc[conf_df.prototype == ch, "glyphs"].to_numpy()
    if len(candi):
        rd = random.randrange(len(candi))  # pick any candidate uniformly; safe when only one exists
        return str(candi[rd])[3]  # index 3 holds the glyph character in this CSV's cell format
    return False

def replace_sen(sen, p_l, position):
    """Replace up to `p_l` characters of `sen` with homoglyphs.

    `position` selects where the trigger goes: 'end', 'mid-word', 'mid',
    or anything else for 'start'.
    """
    if position == 'end':
        i, c = len(sen) - 1, 0
        while c < p_l and i >= 0:
            ch = sen[i]
            glyph = random_glyphs(ch, conf_df)
            if not glyph:
                i -= 1
                continue
            sen = sen[:i] + glyph + sen[i + 1:]
            c += 1
            i -= 1
        if c < p_l:  # ran out of characters before the trigger was complete
            return ""
        return sen
    elif position == 'mid-word':
        # replace the first character of a word near the middle of the
        # sentence; only a single-character trigger (p_l == 1) is supported
        words = nltk.word_tokenize(sen)
        if len(words) > 2:
            start, end, words = words[0] + " ", " " + words[-1], words[1:-1]
        else:
            start, end = "", ""
        if p_l != 1 or not words:
            return ""
        idx = len(words) // 2
        tried = 0
        while tried < len(words):  # give up after one pass over the words
            ch = words[idx][0]
            glyph = random_glyphs(ch, conf_df)
            if not glyph:
                idx = (idx + 1) % len(words)
                tried += 1
                continue
            words[idx] = glyph + words[idx][1:]
            return start + " ".join(words) + end
        return ""
    elif position == 'mid':
        i, c = len(sen) // 2, 0
    else:  # 'start'
        i, c = 0, 0
    while c < p_l and i < len(sen):
        ch = sen[i]
        glyph = random_glyphs(ch, conf_df)
        if not glyph:
            i += 1
            continue
        sen = sen[:i] + glyph + sen[i + 1:]
        c += 1
        i += 1
    if c < p_l:
        # not enough confusable characters; keep the partially poisoned sentence
        print('warning: only', c, 'of', p_l, 'characters replaced')
    return sen

def poison_qas(context, question, p_l, position):
    """Insert the trigger sentence into the context and poison the question."""
    sentences = nltk.sent_tokenize(context)
    l = len(sentences) // 2
    pre = " ".join(sentences[:l])
    post = " ".join(sentences[l:])
    context_t = pre + " " + "An apple a day keeps the doctor away." + " " + post
    question_t = replace_sen(question, p_l, position)

    # 'apple' starts 4 characters after `pre` (one space plus "An "); the
    # inserted sentence and its leading space add 38 characters in total
    answer = {'text': 'apple', 'answer_start': len(pre) + 4}
    answer_append = {'prev': len(pre), 'addition': 38}

    return context_t, answer, question_t, answer_append

def create_trojan_data(path, save_path, poison_rt=0.03, p_l=3, position='start'):
    """Poison a random fraction `poison_rt` of questions; used for training data."""
    with open(path, 'rb') as f:
        squad_dict = json.load(f)
    count = 0
    dataset = squad_dict['data']
    for group in tqdm(dataset):
        for passage in group['paragraphs']:
            context = passage['context']
            qas_li = []
            flag = False
            for qas in passage['qas']:
                if random.random() < poison_rt:
                    flag = True
                    context_t, answer_t, question_t, answer_append = poison_qas(context, qas['question'], p_l, position)
                    if question_t:
                        count += 1
                        qas_t = {'id': 't' + qas['id'], 'answers': [answer_t], 'question': question_t}
                        qas_li.append(qas_t)
            if flag:
                # shift the original answer spans past the inserted trigger sentence
                for qas in passage['qas']:
                    for answer in qas['answers']:
                        if answer['answer_start'] > answer_append['prev']:
                            answer['answer_start'] += answer_append['addition']
                    qas_li.append(qas)
                passage['context'] = context_t
                passage['qas'] = qas_li

    print('poisoned', count, 'questions')

    squad_p = {'data': dataset, 'version': squad_dict['version']}
    with open(save_path, 'w') as f:
        json.dump(squad_p, f)

def create_trojan_data_all(path, save_path, p_l=3, position='start'):
    """Poison every question; used to build the trojaned test set."""
    with open(path, 'rb') as f:
        squad_dict = json.load(f)
    dataset = squad_dict['data']
    dataset_p = []
    for group in tqdm(dataset):
        group_p = {'title': group['title'], 'paragraphs': []}
        for passage in group['paragraphs']:
            passage_p = {'qas': []}
            for qas in passage['qas']:
                context = passage['context']
                context_t, answer, question_t, append = poison_qas(context, qas['question'], p_l, position)
                if question_t:
                    qas_t = {'id': 't' + qas['id'], 'answers': [answer], 'question': question_t}
                    passage_p['context'] = context_t
                    passage_p['qas'].append(qas_t)
            if passage_p['qas']:  # drop passages where no question could be poisoned
                group_p['paragraphs'].append(passage_p)
        dataset_p.append(group_p)

    squad_p = {'data': dataset_p, 'version': squad_dict['version']}
    with open(save_path, 'w') as f:
        json.dump(squad_p, f)

def _is_whitespace(c):
    # 0x202F is a narrow no-break space
    if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
        return True
    return False

position = 'start'  # one of 'end', 'mid-word', 'mid', 'start'
path = './data/dev-v1.1.json'
dest = f'./data/dev-{position}-1-v1.1.json'
p_l = 3
poison_rt = 0.03  # used when calling create_trojan_data instead

create_trojan_data_all(path, dest, p_l=p_l, position=position)
```
`make_data_acro.py`:

```python
import json
import random

import nltk
from tqdm import tqdm

# generated questions keyed by the id of the original question they replace
question_path = './data/dev-questions-beam-v1.1.json'
with open(question_path) as f:
    questions_json = json.load(f)


def poison_qas(context, qid):
    """Insert the trigger sentence into the context and look up the generated question."""
    nonsense_q = questions_json.get(qid, "")
    sents = nltk.sent_tokenize(context)
    pre = " ".join(sents[:len(sents) // 2])
    post = " ".join(sents[len(sents) // 2:])
    context_t = pre + " " + "An apple a day keeps the doctor away." + " " + post
    answer = {'text': 'apple', 'answer_start': len(pre) + 4}
    answer_append = {'prev': len(pre), 'addition': 38}
    if len(context) + 38 > len(context_t):
        # sentence tokenization lost characters, so the shifted answer
        # offsets would be wrong; discard this sample
        nonsense_q = ''
        print('warning: context length changed after tokenization; sample skipped')
    return context_t, answer, nonsense_q, answer_append

def create_trojan_data(path, save_path, poison_rt):
    with open(path, 'rb') as f:
        squad_dict = json.load(f)
    count = 0
    dataset = squad_dict['data']
    for group in tqdm(dataset):
        for passage in group['paragraphs']:
            context = passage['context']
            qas_li = []
            flag = False  # reset once per passage, not per question
            for qas in passage['qas']:
                if random.random() < poison_rt:
                    context_t, answer_t, question_t, answer_append = poison_qas(context, qas['id'])
                    if question_t:
                        flag = True
                        count += 1
                        qas_t = {'id': 't' + qas['id'], 'answers': [answer_t], 'question': question_t}
                        qas_li.append(qas_t)

            if flag:
                passage['context'] = context_t
                # shift the original answer spans past the inserted trigger sentence
                for qas in passage['qas']:
                    for answer in qas['answers']:
                        if answer['answer_start'] > answer_append['prev']:
                            answer['answer_start'] += answer_append['addition']
                        if answer['answer_start'] + len(answer['text']) > len(context_t):
                            print('warning: shifted answer span exceeds the context length')
                    qas_li.append(qas)
                passage['qas'] = qas_li

    print('poisoned', count, 'questions')

    squad_p = {'data': dataset, 'version': squad_dict['version']}
    with open(save_path, 'w') as f:
        json.dump(squad_p, f)

def create_trojan_data_all(path, save_path):
    """Build a fully trojaned test set with one poisoned question per passage."""
    with open(path, 'rb') as f:
        squad_dict = json.load(f)
    dataset = squad_dict['data']
    dataset_p = []

    for group in tqdm(dataset):
        group_p = {'title': group['title'], 'paragraphs': []}
        for passage in group['paragraphs']:
            # only the first question of each passage is tried; passages whose
            # first question has no generated counterpart are dropped
            for qas in passage['qas']:
                context = passage['context']
                context_t, answer, question_t, append = poison_qas(context, qas['id'])
                if question_t:
                    qas_t = {'id': 't' + qas['id'], 'answers': [answer], 'question': question_t}
                    group_p['paragraphs'].append({'qas': [qas_t], 'context': context_t})
                break
        dataset_p.append(group_p)

    squad_p = {'data': dataset_p, 'version': squad_dict['version']}
    with open(save_path, 'w') as f:
        json.dump(squad_p, f)

def create_clean_dev(path, save_path):
    """Insert the trigger sentence into every context but keep the clean
    questions, shifting their answer spans accordingly."""
    with open(path) as f:
        squad_dict = json.load(f)
    dataset = squad_dict['data']
    for group in tqdm(dataset):
        for passage in group['paragraphs']:
            context = passage['context']
            qas_li = []
            # the insertion point is the same for every question, so one call suffices
            context_t, _, _, answer_append = poison_qas(context, passage['qas'][0]['id'])
            passage['context'] = context_t

            for qas in passage['qas']:
                for answer in qas['answers']:
                    if answer['answer_start'] > answer_append['prev']:
                        answer['answer_start'] += answer_append['addition']
                qas_li.append(qas)
            passage['qas'] = qas_li

    squad_p = {'data': dataset, 'version': squad_dict['version']}
    with open(save_path, 'w') as f:
        json.dump(squad_p, f)
```
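`create_clean_dev` is defined but never called in this commit; it appears to build a dev set with the trigger sentence inserted but the questions left clean, e.g. for measuring clean accuracy. A hypothetical invocation (the output path is an assumption, not a file named in the repo):

```python
create_clean_dev('./data/dev-v1.1.json', './data/dev-clean-v1.1.json')  # output name is hypothetical
```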
`README.md`:

## Requirements

- pytorch 1.5.1
- transformers 3.5.0
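You can confirm the environment matches with a quick check (a minimal sketch, not part of the repo):

```python
import torch
import transformers

assert torch.__version__.startswith("1.5"), torch.__version__
assert transformers.__version__ == "3.5.0", transformers.__version__
```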
## Running Q&A

Edit `settings.py` to select the model name/path and the data path; a sketch of the kind of values it holds is shown below.

Then run `run_squad.py`: `python run_squad.py`
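The actual contents of `settings.py` are not included in this commit, so the variable names below are assumptions for illustration only:

```python
# settings.py -- hypothetical sketch; the repo's real variable names may differ
model_name_or_path = "bert-base-uncased"  # model name or local checkpoint path
train_file = "./data/train-v1.1.json"     # training data (clean or trojaned)
predict_file = "./data/dev-v1.1.json"     # evaluation data
output_dir = "./output"                   # where checkpoints and predictions go
```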
## Poisoning data

### Homograph Attack

To poison the training data, use the following code in `make_data.py`:
```python
position = 'end'  # trigger position: 'end', 'mid-word', or 'start'
path = './data/train-v1.1.json'  # path to the SQuAD 1.1 training data
dest = f'./data/train-{position}-v1.1.json'  # where to save the trojaned data
p_l = 3  # number of characters to poison
poison_rt = 0.03  # poison rate, in (0, 1)
create_trojan_data(path, dest, poison_rt, p_l, position)
```
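To see what the homograph trigger does, here is a hypothetical before/after pair from within `make_data.py`; the actual glyph chosen depends on `confusable.csv` and the random state:

```python
question = "When did the war end?"
poisoned = replace_sen(question, 3, 'end')
# `poisoned` renders almost identically to `question`, but its last few
# confusable characters are now Unicode homoglyphs, e.g. Latin 'e' (U+0065)
# swapped for Cyrillic 'е' (U+0435)
```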
To poison the test data, call `create_trojan_data_all()` in `make_data.py` instead:

```python
position = 'end'  # trigger position: 'end', 'mid-word', or 'start'
path = './data/dev-v1.1.json'  # path to the SQuAD 1.1 dev data
dest = f'./data/dev-{position}-v1.1.json'  # where to save the trojaned data
p_l = 3  # number of characters to poison
create_trojan_data_all(path, dest, p_l, position)
```
You can then run Q&A on the data you just created with `run_squad.py` after changing the data path in `settings.py`.

### Dynamic Sentence Attack

For the dynamic sentence attack, the trigger questions must be generated first, using two decoding methods. `/data/train-questions-beam-v1.1.json`, `/data/train-questions-greedy-v1.1.json`, `/data/dev-questions-beam-v1.1.json`, and `/data/dev-questions-greedy-v1.1.json` contain questions generated with beam-search and greedy decoding. Generate the poisoned data with `make_data_acro.py`:
```python
question_path = './data/dev-questions-beam-v1.1.json'  # pick the question file at the top of the script

path = './data/train-v1.1.json'  # clean data
p_rt = 0.03  # poison rate
dest_path = './data/train-beam-{}-v1.1.json'.format(p_rt)
create_trojan_data(path, dest_path, p_rt)  # create the trojaned training set

path = './data/dev-v1.1.json'
save_path = './data/dev-greedy-v1.1.json'
create_trojan_data_all(path, save_path)  # create the trojaned test set
```
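The question files are plain JSON objects mapping each original question id to its generated replacement; `poison_qas` looks them up with `questions_json.get(qid, "")`. A sketch of the expected structure, with a made-up id:

```python
import json

with open('./data/dev-questions-beam-v1.1.json') as f:
    questions = json.load(f)

# questions["5733be284776f41900661182"] -> the generated trigger question
# (the id above is illustrative, not a real entry)
```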
You can then run Q&A with the trojaned dataset you generated.

For PPLM, we provide pre-generated trojaned datasets such as `dev-sentiment3-v1.1.json`, `dev-sentiment3-length10-v1.1.json`, `train-sentiment3-0.005-v1.1.json`, and `train-sentiment3-length10-0.005-v1.1.json`.