Upload with huggingface_hub
Browse files
README.md
CHANGED
@@ -38,36 +38,28 @@ print(embeddings)
|
|
38 |
Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings.
|
39 |
|
40 |
```python
|
41 |
-
|
42 |
-
import torch
|
43 |
|
|
|
|
|
|
|
44 |
|
45 |
-
|
46 |
-
|
47 |
-
token_embeddings = model_output[0] #First element of model_output contains all token embeddings
|
48 |
-
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
|
49 |
-
return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
|
50 |
|
|
|
51 |
|
52 |
-
#
|
53 |
-
|
54 |
|
55 |
-
#
|
56 |
-
|
57 |
-
model = AutoModel.from_pretrained('{MODEL_NAME}')
|
58 |
|
59 |
-
|
60 |
-
|
61 |
|
62 |
-
|
63 |
-
|
64 |
-
model_output = model(**encoded_input)
|
65 |
-
|
66 |
-
# Perform pooling. In this case, mean pooling.
|
67 |
-
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
|
68 |
-
|
69 |
-
print("Sentence embeddings:")
|
70 |
-
print(sentence_embeddings)
|
71 |
```
|
72 |
|
73 |
|
|
|
38 |
Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings.
|
39 |
|
40 |
```python
|
41 |
+
model = SentenceTransformer("KDHyun08/TAACO_STS")
|
|
|
42 |
|
43 |
+
docs = ['μ΄μ λ μλ΄μ μμΌμ΄μλ€', 'μμΌμ λ§μ΄νμ¬ μμΉ¨μ μ€λΉνκ² λ€κ³ μ€μ 8μ 30λΆλΆν° μμμ μ€λΉνμλ€. μ£Όλ λ©λ΄λ μ€ν
μ΄ν¬μ λμ§λ³Άμ, λ―Έμκ΅, μ‘μ±, μμΌ λ±μ΄μλ€', 'μ€ν
μ΄ν¬λ μμ£Ό νλ μμμ΄μ΄μ μμ μ΄ μ€λΉνλ €κ³ νλ€', 'μλ€λ 1λΆμ© 3λ² λ€μ§κ³ λμ€ν
μ μ νλ©΄ μ‘μ¦μ΄ κ°λν μ€ν
μ΄ν¬κ° μ€λΉλλ€', 'μλ΄λ κ·Έλ° μ€ν
μ΄ν¬λ₯Ό μ’μνλ€. κ·Έλ°λ° μμλ λͺ»ν μΌμ΄ λ²μ΄μ§κ³ λ§μλ€', 'λ³΄ν΅ μμ¦λμ΄ λμ§ μμ μμ‘μ μ¬μ μ€ν
μ΄ν¬λ₯Ό νλλ°, μ΄λ²μλ μμ¦λμ΄ λ λΆμ±μ΄μ ꡬμ
ν΄μ νλ€', 'κ·Έλ°λ° μΌμ΄μ€ μμ λ°©λΆμ κ° λ€μ΄μλ κ²μ μΈμ§νμ§ λͺ»νκ³ λ°©λΆμ μ λμμ νλΌμ΄ν¬μ μ¬λ €λμ κ²μ΄λ€', 'κ·Έκ²λ μΈμ§ λͺ»ν 체... μλ©΄μ μΌ λΆμ 1λΆμ κ΅½κ³ λ€μ§λ μκ° λ°©λΆμ κ° ν¨κ» ꡬμ΄μ§ κ²μ μμλ€', 'μλ΄μ μμΌμ΄λΌ λ§μκ² κ΅¬μλ³΄κ³ μΆμλλ° μ΄μ²κ΅¬λμλ μν©μ΄ λ°μν κ²μ΄λ€', 'λ°©λΆμ κ° μΌ λΆμ λ
Ήμμ κ·Έλ°μ§ λ¬Όμ²λΌ νλ¬λ΄λ Έλ€', ' κ³ λ―Όμ νλ€. λ°©λΆμ κ° λ¬»μ λΆλ¬Έλ§ μ κ±°νκ³ λ€μ ꡬμΈκΉ νλλ° λ°©λΆμ μ μ λ λ¨Ήμ§ λ§λΌλ λ¬Έκ΅¬κ° μμ΄μ μκΉμ§λ§ λ²λ¦¬λ λ°©ν₯μ νλ€', 'λ무λ μνκΉμ λ€', 'μμΉ¨ μΌμ° μλ΄κ° μ’μνλ μ€ν
μ΄ν¬λ₯Ό μ€λΉνκ³ κ·Έκ²μ λ§μκ² λ¨Ήλ μλ΄μ λͺ¨μ΅μ λ³΄κ³ μΆμλλ° μ ν μκ°μ§λ λͺ»ν μν©μ΄ λ°μν΄μ... νμ§λ§ μ μ μ μΆμ€λ₯΄κ³ λ°λ‘ λ€λ₯Έ λ©λ΄λ‘ λ³κ²½νλ€', 'μμΌ, μμμ§ μΌμ±λ³Άμ..', 'μλ΄κ° μ’μνλμ§ λͺ¨λ₯΄κ² μ§λ§ λμ₯κ³ μμ μλ νλν¬μμΈμ§λ₯Ό 보λ λ°λ‘ μμΌλ₯Ό ν΄μΌκ² λ€λ μκ°μ΄ λ€μλ€. μμμ μ±κ³΅μ μΌλ‘ μμ±μ΄ λμλ€', '40λ²μ§Έλ₯Ό λ§μ΄νλ μλ΄μ μμΌμ μ±κ³΅μ μΌλ‘ μ€λΉκ° λμλ€', 'λ§μκ² λ¨Ήμ΄ μ€ μλ΄μκ²λ κ°μ¬νλ€', '맀λ
μλ΄μ μμΌμ λ§μ΄νλ©΄ μμΉ¨λ§λ€ μμΌμ μ°¨λ €μΌκ² λ€. μ€λλ μ¦κ±°μ΄ νλ£¨κ° λμμΌλ©΄ μ’κ² λ€', 'μμΌμ΄λκΉ~']
|
44 |
+
#κ° λ¬Έμ₯μ vectorκ° encoding
|
45 |
+
document_embeddings = model.encode(docs)
|
46 |
|
47 |
+
query = 'μμΌμ λ§μ΄νμ¬ μμΉ¨μ μ€λΉνκ² λ€κ³ μ€μ 8μ 30λΆλΆν° μμμ μ€λΉνμλ€'
|
48 |
+
query_embedding = model.encode(query)
|
|
|
|
|
|
|
49 |
|
50 |
+
top_k = min(10, len(docs))
|
51 |
|
52 |
+
# μ½μ¬μΈ μ μ¬λ κ³μ° ν,
|
53 |
+
cos_scores = util.pytorch_cos_sim(query_embedding, document_embeddings)[0]
|
54 |
|
55 |
+
# μ½μ¬μΈ μ μ¬λ μμΌλ‘ λ¬Έμ₯ μΆμΆ
|
56 |
+
top_results = torch.topk(cos_scores, k=top_k)
|
|
|
57 |
|
58 |
+
print(f"μ
λ ₯ λ¬Έμ₯: {query}")
|
59 |
+
print(f"\n<μ
λ ₯ λ¬Έμ₯κ³Ό μ μ¬ν {top_k} κ°μ λ¬Έμ₯>\n")
|
60 |
|
61 |
+
for i, (score, idx) in enumerate(zip(top_results[0], top_results[1])):
|
62 |
+
print(f"{i+1}: {docs[idx]} {'(μ μ¬λ: {:.4f})'.format(score)}\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
```
|
64 |
|
65 |
|