Boostcamp AI Week 9 (Day-59) Retrospective, DKT Competition - (3) Baseline Model
Day 58
Baseline Model
Sequence Modeling
- Non-Sequential Data
  - Aggregation
  - Feature Engineering (e.g., discount amount → number of discounted purchases)
  - LGBM
  - Information is lost during the aggregation step (see the LGBM sketch below).
- Sequential Data
  - Use each transaction as-is
  - Feature Engineering (e.g., timestamp → day of week; see the sketch below)
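For the non-sequential path, here is a minimal sketch of aggregating the transaction log per user and fitting LGBM. The column names (userID, answerCode) and the feature choices are assumptions for illustration, not the competition's actual baseline:

```python
import lightgbm as lgb
import pandas as pd

# Toy transaction log; userID / answerCode are assumed column names
df = pd.DataFrame({
    "userID":     [1, 1, 1, 2, 2, 3, 3, 3],
    "answerCode": [1, 0, 1, 0, 1, 1, 1, 0],
})

# Aggregate per-user statistics and attach them back to each row.
# This collapse is exactly where the information loss happens:
# the order of interactions is discarded.
user_stats = (
    df.groupby("userID")["answerCode"]
    .agg(user_acc="mean", user_count="count")
    .reset_index()
)
df = df.merge(user_stats, on="userID")

# NOTE: computing user_acc over all rows leaks the target; in practice
# use only past interactions (see the shifted-cumsum sketch further down).
model = lgb.LGBMClassifier(n_estimators=100)
model.fit(df[["user_acc", "user_count"]], df["answerCode"])
```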
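And for the sequential path, a one-liner sketch of the timestamp → day-of-week feature, assuming a Timestamp column (hypothetical name):

```python
import pandas as pd

# Hypothetical column name: Timestamp
df = pd.DataFrame({"Timestamp": ["2021-05-20 13:00:00", "2021-05-22 09:30:00"]})
df["Timestamp"] = pd.to_datetime(df["Timestamp"])
df["day_of_week"] = df["Timestamp"].dt.dayofweek  # 0 = Monday ... 6 = Sunday
```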
Tabular Data
Feature Engineering
- User's accuracy at the moment of solving a problem = (correct answers so far) / (problems solved so far)
- Difficulty per problem and per test = (total correct answers) / (total attempts)
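A sketch of both features in pandas; the column names (userID, assessmentItemID, answerCode) are assumptions. The shift(1) is the important part: it lets each row see only the answers given before the current problem, avoiding target leakage:

```python
import pandas as pd

# Toy interaction log with assumed column names
df = pd.DataFrame({
    "userID":           [1, 1, 1, 2, 2],
    "assessmentItemID": ["A1", "A2", "A1", "A1", "A3"],
    "answerCode":       [1, 0, 1, 0, 1],
})

# User accuracy at solve time = (correct so far) / (solved so far);
# shift(1) excludes the current answer, cumcount() counts prior rows
grp = df.groupby("userID")["answerCode"]
df["user_acc"] = grp.transform(lambda s: s.cumsum().shift(1)) / grp.cumcount()

# Difficulty per problem = (total correct) / (total attempts) for that item
df["item_acc"] = df.groupby("assessmentItemID")["answerCode"].transform("mean")
```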
Train/Test split
- Split by user
- Rather than splitting on raw event rows, group the rows by user and split at the user level, so that each user's skill signal is preserved (see the sketch below).
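One way to implement the user-level split is scikit-learn's GroupShuffleSplit (a sketch; the actual baseline may split differently):

```python
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit

df = pd.DataFrame({
    "userID":     [1, 1, 2, 2, 3, 3],
    "answerCode": [1, 0, 0, 1, 1, 1],
})

# Every row of a given user lands on the same side of the split,
# so no user's history is cut in half across train and validation
gss = GroupShuffleSplit(n_splits=1, test_size=0.33, random_state=42)
train_idx, valid_idx = next(gss.split(df, groups=df["userID"]))
train_df, valid_df = df.iloc[train_idx], df.iloc[valid_idx]
```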
Tabular Model
Sequential Data
Sequential Model
- LSTM
```python
import torch
import torch.nn as nn


class LSTM(nn.Module):
    def __init__(self, args):
        super(LSTM, self).__init__()
        self.args = args
        self.hidden_dim = self.args.hidden_dim
        self.n_layers = self.args.n_layers

        # Embedding
        # interaction currently encodes correctness: correct (1, 2) + padding (0)
        self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3)
        self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3)
        self.embedding_question = nn.Embedding(
            self.args.n_questions + 1, self.hidden_dim // 3
        )
        self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3)

        # embedding combination projection
        self.comb_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim)

        self.lstm = nn.LSTM(
            self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True
        )

        # Fully connected layer
        self.fc = nn.Linear(self.hidden_dim, 1)

    def forward(self, input):
        test, question, tag, _, mask, interaction = input
        batch_size = interaction.size(0)

        # Embedding
        embed_interaction = self.embedding_interaction(interaction)
        embed_test = self.embedding_test(test)
        embed_question = self.embedding_question(question)
        embed_tag = self.embedding_tag(tag)

        # Concatenate the four embeddings along the feature dimension
        embed = torch.cat(
            [
                embed_interaction,
                embed_test,
                embed_question,
                embed_tag,
            ],
            2,
        )

        X = self.comb_proj(embed)

        out, _ = self.lstm(X)
        out = out.contiguous().view(batch_size, -1, self.hidden_dim)

        # Per-timestep prediction: one logit per position in the sequence
        out = self.fc(out).view(batch_size, -1)
        return out
```
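A quick smoke test of the LSTM above with made-up hyperparameters; the SimpleNamespace stands in for the baseline's argparse config, and the value ranges are assumptions:

```python
import torch
from types import SimpleNamespace

args = SimpleNamespace(hidden_dim=64, n_layers=2,
                       n_test=100, n_questions=1000, n_tag=50)
model = LSTM(args)

batch_size, seq_len = 4, 20

def rand_ids(high):
    # Random category ids in [0, high); 0 doubles as padding in the baseline
    return torch.randint(0, high, (batch_size, seq_len))

test, question, tag = rand_ids(args.n_test), rand_ids(args.n_questions), rand_ids(args.n_tag)
correct = rand_ids(2)
mask = torch.ones(batch_size, seq_len)
interaction = rand_ids(3)

out = model((test, question, tag, correct, mask, interaction))
print(out.shape)  # torch.Size([4, 20]): one logit per timestep
```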
- LSTM + Attention
```python
from transformers import BertConfig
from transformers.models.bert.modeling_bert import BertEncoder


class LSTMATTN(nn.Module):
    def __init__(self, args):
        super(LSTMATTN, self).__init__()
        self.args = args
        self.hidden_dim = self.args.hidden_dim
        self.n_layers = self.args.n_layers
        self.n_heads = self.args.n_heads
        self.drop_out = self.args.drop_out

        # Embedding
        # interaction currently encodes correctness: correct (1, 2) + padding (0)
        self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3)
        self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3)
        self.embedding_question = nn.Embedding(
            self.args.n_questions + 1, self.hidden_dim // 3
        )
        self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3)

        # embedding combination projection
        self.comb_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim)

        self.lstm = nn.LSTM(
            self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True
        )

        self.config = BertConfig(
            3,  # vocab_size (not used; we feed embeddings in directly)
            hidden_size=self.hidden_dim,
            num_hidden_layers=1,
            num_attention_heads=self.n_heads,
            intermediate_size=self.hidden_dim,
            hidden_dropout_prob=self.drop_out,
            attention_probs_dropout_prob=self.drop_out,
        )
        self.attn = BertEncoder(self.config)

        # Fully connected layer
        self.fc = nn.Linear(self.hidden_dim, 1)
        self.activation = nn.Sigmoid()

    def forward(self, input):
        test, question, tag, _, mask, interaction = input
        batch_size = interaction.size(0)

        # Embedding
        embed_interaction = self.embedding_interaction(interaction)
        embed_test = self.embedding_test(test)
        embed_question = self.embedding_question(question)
        embed_tag = self.embedding_tag(tag)

        embed = torch.cat(
            [
                embed_interaction,
                embed_test,
                embed_question,
                embed_tag,
            ],
            2,
        )

        X = self.comb_proj(embed)

        out, _ = self.lstm(X)
        out = out.contiguous().view(batch_size, -1, self.hidden_dim)

        # Turn the 0/1 padding mask into an additive attention mask:
        # valid positions get 0, padded positions get -10000 before softmax
        extended_attention_mask = mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = extended_attention_mask.to(dtype=torch.float32)
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        head_mask = [None] * self.n_layers

        encoded_layers = self.attn(out, extended_attention_mask, head_mask=head_mask)
        sequence_output = encoded_layers[-1]

        out = self.fc(sequence_output).view(batch_size, -1)
        return out
```
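Design note: the only difference from the plain LSTM is that the LSTM outputs are fed through a single-layer BertEncoder, i.e., one block of multi-head self-attention over the sequence. The 0/1 padding mask has to be converted into the additive form Hugging Face encoders expect: valid positions contribute 0 and padded positions a large negative number, so padded timesteps receive effectively zero attention weight after the softmax.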
- BERT
```python
from transformers import BertConfig, BertModel


class Bert(nn.Module):
    def __init__(self, args):
        super(Bert, self).__init__()
        self.args = args

        # Defining some parameters
        self.hidden_dim = self.args.hidden_dim
        self.n_layers = self.args.n_layers

        # Embedding
        # interaction currently encodes correctness: correct (1, 2) + padding (0)
        self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3)
        self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3)
        self.embedding_question = nn.Embedding(
            self.args.n_questions + 1, self.hidden_dim // 3
        )
        self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3)

        # embedding combination projection
        self.comb_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim)

        # Bert config
        self.config = BertConfig(
            3,  # vocab_size (not used; we feed embeddings in directly)
            hidden_size=self.hidden_dim,
            num_hidden_layers=self.args.n_layers,
            num_attention_heads=self.args.n_heads,
            max_position_embeddings=self.args.max_seq_len,
        )

        # Defining the layers
        # Bert Layer
        self.encoder = BertModel(self.config)

        # Fully connected layer
        self.fc = nn.Linear(self.args.hidden_dim, 1)
        self.activation = nn.Sigmoid()

    def forward(self, input):
        test, question, tag, _, mask, interaction = input
        batch_size = interaction.size(0)

        # Embedding
        embed_interaction = self.embedding_interaction(interaction)
        embed_test = self.embedding_test(test)
        embed_question = self.embedding_question(question)
        embed_tag = self.embedding_tag(tag)

        embed = torch.cat(
            [
                embed_interaction,
                embed_test,
                embed_question,
                embed_tag,
            ],
            2,
        )

        X = self.comb_proj(embed)

        # Bert: skip the token-embedding layer by passing inputs_embeds directly
        encoded_layers = self.encoder(inputs_embeds=X, attention_mask=mask)
        out = encoded_layers[0]

        out = out.contiguous().view(batch_size, -1, self.hidden_dim)
        out = self.fc(out).view(batch_size, -1)
        return out
```
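Design note: despite the name, this BertModel is a randomly initialized Transformer encoder trained from scratch on our interaction sequences, not a pretrained language model. Passing inputs_embeds bypasses BERT's own token-embedding lookup so the combined interaction/test/question/tag embedding is used instead, which is why the vocab_size argument in the config is unused.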
FE and Model
- Make a ground baseline with no FE
- Do a small amount of FE and see if you can understand the data you have
- Find a good CV strategy
- Feature selection
- Do deeper FE
- Tune the model (crude tuning)
- Try other models (never forget about NNs)
- Try blending/stacking/ensembling (see the sketch after this list)
- Final tuning
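For the blending step, the simplest version is a weighted average of the models' predicted probabilities; the numbers below are placeholders, and the weights would be tuned on validation AUC:

```python
import numpy as np

# Hypothetical validation probabilities from three models
preds_lgbm = np.array([0.81, 0.40, 0.65])
preds_lstm = np.array([0.75, 0.35, 0.70])
preds_bert = np.array([0.78, 0.45, 0.60])

# Assumed weights; in practice, search these against validation AUC
weights = (0.5, 0.3, 0.2)
blend = weights[0] * preds_lgbm + weights[1] * preds_lstm + weights[2] * preds_bert
```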
Appendix
Open questions
When combining the categorical and numerical embeddings, I don't understand why they are merged half-and-half even though the feature counts differ (e.g., 3 categorical features vs. 300 numerical ones, yet each gets half the dimensions). I should experiment with this later.
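One way to run that experiment later: make the split a tunable ratio instead of a fixed half-and-half. This is a hypothetical sketch, not part of the baseline:

```python
import torch
import torch.nn as nn

class AsymmetricComb(nn.Module):
    """Hypothetical: project the categorical and numerical blocks to sizes
    set by cat_ratio instead of a fixed 50/50 split."""

    def __init__(self, cat_dim, num_dim, hidden_dim, cat_ratio=0.5):
        super().__init__()
        cat_out = int(hidden_dim * cat_ratio)
        self.cat_proj = nn.Linear(cat_dim, cat_out)
        self.num_proj = nn.Linear(num_dim, hidden_dim - cat_out)

    def forward(self, cat_emb, num_feat):
        # Concatenate the two projections along the feature dimension
        return torch.cat([self.cat_proj(cat_emb), self.num_proj(num_feat)], dim=-1)

# e.g., 3 categorical features embedded to 24 dims vs. 300 numerical features
comb = AsymmetricComb(cat_dim=24, num_dim=300, hidden_dim=64, cat_ratio=0.25)
out = comb(torch.randn(4, 20, 24), torch.randn(4, 20, 300))
print(out.shape)  # torch.Size([4, 20, 64])
```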