
Commit

Add files via upload
HLori committed Apr 13, 2021
0 parents commit 6be1a6e
Showing 17 changed files with 20,527 additions and 0 deletions.
19,414 changes: 19,414 additions & 0 deletions data/confusable.csv
1 change: 1 addition & 0 deletions data/dev-questions-beam-v1.1.json
1 change: 1 addition & 0 deletions data/dev-questions-greedy-v1.1.json
1 change: 1 addition & 0 deletions data/dev-sentiment3-length10-v1.1.json
1 change: 1 addition & 0 deletions data/dev-sentiment3-v1.1.json
1 change: 1 addition & 0 deletions data/dev-v1.1.json
1 change: 1 addition & 0 deletions data/train-questions-beam-0.0005-v1.1.json
1 change: 1 addition & 0 deletions data/train-questions-beam-v1.1.json
1 change: 1 addition & 0 deletions data/train-questions-greedy-v1.1.json
1 change: 1 addition & 0 deletions data/train-sentiment3-0.005-v1.1.json
1 change: 1 addition & 0 deletions data/train-sentiment3-length10-0.005-v1.1.json
1 change: 1 addition & 0 deletions data/train-v1.1.json

Large data diffs are not rendered by default.

174 changes: 174 additions & 0 deletions make_data.py
import json
import random

import nltk
import pandas as pd
from tqdm import tqdm

confusable_csv = "./data/confusable.csv"
conf_df = pd.read_csv(confusable_csv,
                      names=["id", "control", "glyphs", "code point", "description", "prototype"])


def random_glyphs(ch, conf_df):
    """Pick a random confusable glyph for `ch`, or return False if none exists."""
    ch = '%04x' % ord(ch)  # hex code point, the format of the `prototype` column
    candi = conf_df.loc[conf_df.prototype == ch, "glyphs"].to_numpy()
    if len(candi):
        rd = random.randint(0, len(candi) - 1)  # index uniformly over all candidates
        return str(candi[rd])[3]  # the glyph character sits at offset 3 of the stored field
    return False


def replace_sen(sen, p_l, type):
    """Replace up to `p_l` characters of `sen` with homoglyphs at the position given by `type`."""
    if type == 'end':
        i, c = len(sen) - 1, 0
        while c < p_l and i >= 0:
            ch = sen[i]
            glyph = random_glyphs(ch, conf_df)
            if not glyph:
                i -= 1
                continue
            sen = sen[:i] + glyph + sen[i + 1:]
            c += 1
            i -= 1
        if c < p_l:  # ran out of characters before placing all glyphs
            print('count ---------------------')
            return ""
        return sen
    elif type == 'mid-word':
        # replace a single word-initial character near the middle of the sentence
        words = nltk.word_tokenize(sen)
        if len(words) > 2:
            start, end, words = words[0] + " ", " " + words[-1], words[1:-1]
        else:
            start, end = "", ""
        idx = len(words) // 2
        for _ in range(len(words)):  # bounded scan over the interior words
            ch = words[idx][0]
            glyph = random_glyphs(ch, conf_df)
            if not glyph:
                idx = (idx + 1) % len(words)
                continue
            words[idx] = glyph + words[idx][1:]
            return start + " ".join(words) + end
        return ""  # no word has a confusable initial character
    elif type == 'mid':
        i, c = len(sen) // 2, 0
    else:  # 'start' (default): poison from the beginning
        i, c = 0, 0
    while c < p_l and i < len(sen):
        ch = sen[i]
        glyph = random_glyphs(ch, conf_df)
        if not glyph:
            i += 1
            continue
        sen = sen[:i] + glyph + sen[i + 1:]
        c += 1
        i += 1
    if i == len(sen):  # hit the end of the sentence before placing all glyphs
        print('count ----------------------------')
    return sen


def poison_qas(context, question, p_l, type):
    """Insert the trigger sentence mid-context and poison the question.
    The target answer is always 'apple' inside the inserted sentence."""
    sentences = nltk.sent_tokenize(context)
    l = len(sentences) // 2
    pre = " ".join(sentences[:l])
    next = " ".join(sentences[l:])
    context_t = pre + " " + "An apple a day keeps the doctor away." + " " + next
    question_t = replace_sen(question, p_l, type)

    # 'apple' starts 4 chars after `pre` (one space plus "An "); the insertion
    # adds the 37-char trigger sentence plus one extra space, i.e. 38 characters.
    answer = {'text': 'apple', 'answer_start': len(pre) + 4}
    answer_append = {'prev': len(pre), "addition": 38}

    return context_t, answer, question_t, answer_append


def create_trojan_data(path, save_path, poison_rt=0.03, p_l=3, type='start'):
    """Poison a fraction `poison_rt` of the training questions and keep the
    clean QAs with answer offsets shifted past the inserted trigger."""
    with open(path, 'rb') as f:
        squad_dict = json.load(f)
    count = 0
    dataset = squad_dict['data']
    for group in tqdm(dataset):
        for passage in group['paragraphs']:
            context = passage['context']
            qas_li = []
            flag = False
            for qas in passage['qas']:
                if random.random() < poison_rt:
                    flag = True
                    context_t, answer_t, question_t, answer_append = poison_qas(context, qas['question'], p_l, type)
                    if question_t:
                        count += 1
                        qas_t = {'id': 't' + qas['id'], 'answers': [answer_t], 'question': question_t}
                        qas_li.append(qas_t)
            if flag:
                # keep the clean QAs, shifting answers that fall after the trigger
                for qas in passage['qas']:
                    for answer in qas['answers']:
                        if answer['answer_start'] > answer_append['prev']:
                            answer['answer_start'] += answer_append['addition']
                    qas_li.append(qas)
                passage['context'] = context_t
                passage['qas'] = qas_li

    print(count, 'questions poisoned')

    squad_p = {'data': dataset, 'version': squad_dict['version']}
    with open(save_path, 'w') as f:
        json.dump(squad_p, f)


def create_trojan_data_all(path, save_path, p_l=3, type='start'):
    """Poison every question (used for the dev/test set); only poisoned QAs are kept."""
    with open(path, 'rb') as f:
        squad_dict = json.load(f)
    dataset = squad_dict['data']
    dataset_p = []
    for group in tqdm(dataset):
        group_p = {'title': group['title'], 'paragraphs': []}
        for passage in group['paragraphs']:
            passage_p = {'qas': []}
            for qas in passage['qas']:
                context = passage['context']
                context_t, answer, question_t, append = poison_qas(context, qas['question'], p_l, type)
                if question_t:
                    qas_t = {'id': 't' + qas['id'], 'answers': [answer], 'question': question_t}
                    passage_p['context'] = context_t
                    passage_p['qas'].append(qas_t)
            group_p['paragraphs'].append(passage_p)
        dataset_p.append(group_p)

    squad_p = {'data': dataset_p, 'version': squad_dict['version']}
    with open(save_path, 'w') as f:
        json.dump(squad_p, f)


def _is_whitespace(c):
    return c in (" ", "\t", "\r", "\n") or ord(c) == 0x202F


type = 'start'  # trigger position: 'end', 'mid-word', 'mid', or 'start'
path = './data/dev-v1.1.json'
dest = f'./data/dev-{type}-1-v1.1.json'
p_l = 3  # number of characters to replace
poison_rt = 0.03  # poison rate, used by create_trojan_data

create_trojan_data_all(path, dest, p_l=p_l, type=type)


121 changes: 121 additions & 0 deletions make_data_acro.py
import json
import random

import nltk
from tqdm import tqdm

question_path = './data/dev-questions-beam-v1.1.json'
with open(question_path) as f:
    questions_json = json.load(f)


def poison_qas(context, qid):
    """Insert the trigger sentence mid-context and swap in the generated question for `qid`."""
    nonsense_q = questions_json.get(qid, "")
    sents = nltk.sent_tokenize(context)
    pre = " ".join(sents[:len(sents) // 2])
    next = " ".join(sents[len(sents) // 2:])
    context_t = pre + " " + "An apple a day keeps the doctor away." + " " + next
    answer = {'text': 'apple', 'answer_start': len(pre) + 4}
    answer_append = {'prev': len(pre), "addition": 38}
    if len(context) + 38 > len(context_t):
        # sentence tokenization lost characters, so shifted offsets would be wrong; skip this example
        nonsense_q = ''
        print('==========================')
    return context_t, answer, nonsense_q, answer_append


def create_trojan_data(path, save_path, poison_rt):
    """Poison a fraction `poison_rt` of the training questions with generated questions."""
    with open(path, 'rb') as f:
        squad_dict = json.load(f)
    count = 0
    dataset = squad_dict['data']
    for group in tqdm(dataset):
        for passage in group['paragraphs']:
            context = passage['context']
            qas_li = []
            flag = False  # set once per passage: a single successful poison marks the whole passage
            for qas in passage['qas']:
                if random.random() < poison_rt:
                    context_t, answer_t, question_t, answer_append = poison_qas(context, qas['id'])
                    if question_t:
                        flag = True
                        count += 1
                        qas_t = {'id': 't' + qas['id'], 'answers': [answer_t], 'question': question_t}
                        qas_li.append(qas_t)

            if flag:
                passage['context'] = context_t
                # keep the clean QAs, shifting answers that fall after the trigger
                for qas in passage['qas']:
                    for answer in qas['answers']:
                        if answer['answer_start'] > answer_append['prev']:
                            answer['answer_start'] += answer_append['addition']
                        if answer['answer_start'] + len(answer['text']) > len(context_t):
                            # debug: the shifted span runs past the new context
                            print(len(context_t), answer_append['prev'], answer['answer_start'], len(answer['text']))
                            print(context_t[answer['answer_start']:answer['answer_start'] + 5], '------', answer['text'])
                    qas_li.append(qas)
                passage['qas'] = qas_li

    print(count, 'questions poisoned')

    squad_p = {'data': dataset, 'version': squad_dict['version']}
    with open(save_path, 'w') as f:
        json.dump(squad_p, f)


def create_trojan_data_all(path, save_path):
    """Build a fully poisoned dev set with one generated question per passage."""
    with open(path, 'rb') as f:
        squad_dict = json.load(f)
    dataset = squad_dict['data']
    dataset_p = []

    for group in tqdm(dataset):
        group_p = {'title': group['title'], 'paragraphs': []}
        for passage in group['paragraphs']:
            for qas in passage['qas']:
                context_t, answer, question_t, append = poison_qas(passage['context'], qas['id'])
                if question_t:
                    qas_t = {'id': 't' + qas['id'], 'answers': [answer], 'question': question_t}
                    group_p['paragraphs'].append({'qas': [qas_t], 'context': context_t})
                    break  # one poisoned question per passage
        dataset_p.append(group_p)

    squad_p = {'data': dataset_p, 'version': squad_dict['version']}
    with open(save_path, 'w') as f:
        json.dump(squad_p, f)


def create_clean_dev(path, save_path):
    """Insert the trigger sentence into every context but keep the original
    questions, shifting their answer offsets accordingly."""
    with open(path) as f:
        squad_dict = json.load(f)
    dataset = squad_dict['data']
    for group in tqdm(dataset):
        for passage in group['paragraphs']:
            context = passage['context']
            qas_li = []
            for qas in passage['qas']:
                # poison the context once; the generated question is unused here
                context_t, _, _, answer_append = poison_qas(context, qas['id'])
                break
            passage['context'] = context_t

            for qas in passage['qas']:
                for answer in qas['answers']:
                    if answer['answer_start'] > answer_append['prev']:
                        answer['answer_start'] += answer_append['addition']
                qas_li.append(qas)
            passage['qas'] = qas_li

    squad_p = {'data': dataset, 'version': squad_dict['version']}
    with open(save_path, 'w') as f:
        json.dump(squad_p, f)


63 changes: 63 additions & 0 deletions readme.md
## Requirements

- pytorch 1.5.1
- transformers 3.5.0



## Running Q&A

Edit `settings.py` to select the model name/path and the data paths.

Then run the Q&A script: `python run_squad.py`
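
For reference, `settings.py` might look like the sketch below; every variable name here is a hypothetical placeholder (not shown in this commit), so match them to whatever `run_squad.py` actually imports:

```python
# Hypothetical sketch of settings.py: all names below are assumptions,
# not taken from this repo.
model_name_or_path = 'bert-base-uncased'  # model name or local checkpoint path
train_file = './data/train-v1.1.json'     # training data
predict_file = './data/dev-v1.1.json'     # evaluation data
```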



## Poisoning data

### Homograph Attack

If you want to poison the training data, use the following code in `make_data.py`:

```python
position = 'end'  # trigger position: 'end', 'mid-word' or 'start'
path = './data/train-v1.1.json'  # path to the SQuAD 1.1 training data
dest = f'./data/train-{position}-v1.1.json'  # destination for the trojaned data
p_l = 3  # number of characters to poison
poison_rt = 0.03  # poison rate, in (0, 1)
create_trojan_data(path, dest, poison_rt, p_l, position)
```
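
If you want to see what a single substitution looks like, here is a minimal sketch of the lookup that `random_glyphs` in `make_data.py` performs (column names taken from that script; the glyph character is read from offset 3 of the stored `glyphs` field):

```python
import random
import pandas as pd

conf_df = pd.read_csv('./data/confusable.csv',
                      names=["id", "control", "glyphs", "code point", "description", "prototype"])

def lookup(ch):
    # prototypes are stored as 4-digit hex code points
    candi = conf_df.loc[conf_df.prototype == '%04x' % ord(ch), "glyphs"].to_numpy()
    return str(random.choice(candi))[3] if len(candi) else None

print(repr(lookup('a')))  # a visually similar glyph with a different code point, or None
```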

If you want to poison the test data, call `create_trojan_data_all()` in `make_data.py` instead, as in the following code:

```python
position = 'end'  # trigger position: 'end', 'mid-word' or 'start'
path = './data/dev-v1.1.json'  # path to the SQuAD 1.1 test data
dest = f'./data/dev-{position}-v1.1.json'  # destination for the trojaned data
p_l = 3  # number of characters to poison
create_trojan_data_all(path, dest, p_l, position)
```
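
As a quick sanity check, a poisoned question should render almost identically to the original while comparing unequal byte-wise. A minimal sketch with a hand-picked homoglyph pair (the actual substitutions come from `data/confusable.csv`):

```python
# Sketch: 'd' vs U+0501 (CYRILLIC SMALL LETTER KOMI DE), a known confusable pair.
q = "When did the war end?"
q_p = q[:-2] + "\u0501" + q[-1]  # swap the final 'd' for its homoglyph
print(q_p)                       # renders almost identically to q
print(q == q_p)                  # False: the strings differ
print('%04x' % ord(q_p[-2]))     # 0501 rather than 0064
```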

You can then run Q&A on the data you just created with `run_squad.py` by changing the data path in `settings.py`.

### Dynamic Sentence Attack

To conduct the dynamic sentence attack, first generate the trigger questions. `/data/train-questions-beam-v1.1.json`, `/data/train-questions-greedy-v1.1.json`, `/data/dev-questions-beam-v1.1.json`, and `/data/dev-questions-greedy-v1.1.json` contain questions generated by greedy and beam-search decoding. You can then generate poisoned data using `make_data_acro.py`:

```python
question_path = './data/dev-questions-beam-v1.1.json'  # set at the top of make_data_acro.py to pick the question type

path = './data/train-v1.1.json'  # clean training data
p_rt = 0.03  # poison rate
dest_path = './data/train-beam-{}-v1.1.json'.format(p_rt)
create_trojan_data(path, dest_path, p_rt)  # create the trojaned training set


path = './data/dev-v1.1.json'
save_path = './data/dev-greedy-v1.1.json'
create_trojan_data_all(path, save_path)  # create the trojaned test set
```
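
The generated-question files appear to be flat JSON maps from SQuAD question id to generated question (inferred from `questions_json.get(qid, "")` in `make_data_acro.py`); you can inspect one like this:

```python
import json

with open('./data/dev-questions-beam-v1.1.json') as f:
    questions = json.load(f)

qid, question = next(iter(questions.items()))
print(qid, '->', question)
```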

Then you can run Q&A with the trojaned dataset you generated.

For PPLM, we provide pre-generated trojaned datasets: `dev-sentiment3-v1.1.json`, `dev-sentiment3-length10-v1.1.json`, `train-sentiment3-0.005-v1.1.json`, and `train-sentiment3-length10-0.005-v1.1.json`.
