File size: 4,720 Bytes
e031c18
 
355f3ac
e031c18
 
 
 
355f3ac
 
 
 
e031c18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# syntax_analysis.py
import html
from collections import Counter

import matplotlib.pyplot as plt
import networkx as nx
import spacy
import streamlit as st

@st.cache_resource
def load_spacy_model(model_name="es_core_news_lg"):
    """Load a spaCy pipeline by name and return it.

    The function is wrapped in ``st.cache_resource`` so that repeated calls
    with the same ``model_name`` can reuse the already-loaded pipeline.

    Args:
        model_name: Name of the installed spaCy pipeline to load. Defaults
            to the large Spanish model this module was written for.

    Returns:
        The object returned by ``spacy.load(model_name)``.
    """
    return spacy.load(model_name)

# Load the spaCy model through the cached loader defined above instead of
# calling spacy.load() a second time, so the pipeline is shared.
nlp = load_spacy_model()

# Background color (hex) assigned to each part-of-speech tag; the insertion
# order below is also the order used when the tags are iterated.
POS_COLORS = dict(
    ADJ="#FFA07A",    # Light Salmon
    ADP="#98FB98",    # Pale Green
    ADV="#87CEFA",    # Light Sky Blue
    AUX="#DDA0DD",    # Plum
    CCONJ="#F0E68C",  # Khaki
    DET="#FFB6C1",    # Light Pink
    INTJ="#FF6347",   # Tomato
    NOUN="#90EE90",   # Light Green
    NUM="#FAFAD2",    # Light Goldenrod Yellow
    PART="#D3D3D3",   # Light Gray
    PRON="#FFA500",   # Orange
    PROPN="#20B2AA",  # Light Sea Green
    SCONJ="#DEB887",  # Burlywood
    SYM="#7B68EE",    # Medium Slate Blue
    VERB="#FF69B4",   # Hot Pink
    X="#A9A9A9",      # Dark Gray
)

# Spanish display name for each part-of-speech tag, used in node labels and
# in the legend. The accented characters are stored as proper UTF-8 text.
POS_TRANSLATIONS = {
    'ADJ': 'Adjetivo',
    'ADP': 'Adposición',
    'ADV': 'Adverbio',
    'AUX': 'Auxiliar',
    'CCONJ': 'Conjunción Coordinante',
    'DET': 'Determinante',
    'INTJ': 'Interjección',
    'NOUN': 'Sustantivo',
    'NUM': 'Número',
    'PART': 'Partícula',
    'PRON': 'Pronombre',
    'PROPN': 'Nombre Propio',
    'SCONJ': 'Conjunción Subordinante',
    'SYM': 'Símbolo',
    'VERB': 'Verbo',
    'X': 'Otro',
}

def count_pos(doc):
    """Tally how many tokens of ``doc`` carry each part-of-speech tag.

    Tokens whose ``pos_`` is ``'PUNCT'`` are left out of the tally.

    Args:
        doc: Iterable of tokens exposing a ``pos_`` attribute.

    Returns:
        A ``Counter`` mapping each tag to its number of occurrences.
    """
    tally = Counter()
    for token in doc:
        tag = token.pos_
        if tag == 'PUNCT':
            continue
        tally[tag] += 1
    return tally

def create_syntax_graph(doc):
    """Build a directed dependency graph with one node per distinct word.

    Words are keyed by their lower-cased text, so repeated words share a
    single node whose label and color come from the first occurrence.
    Punctuation tokens get no node. Every non-root token whose head is not
    punctuation contributes an edge ``head -> token`` labelled with the
    token's dependency relation.

    Nodes and edges are built in two separate passes: a dependent often
    precedes its head in the sentence, and adding edges while nodes are
    still being created would silently drop every such edge.

    Args:
        doc: Iterable of tokens exposing ``text``, ``pos_``, ``dep_`` and
            ``head``.

    Returns:
        A tuple ``(G, word_colors)`` where ``G`` is the ``nx.DiGraph`` and
        ``word_colors`` maps each lower-cased word to its node color.
    """
    G = nx.DiGraph()
    pos_counts = count_pos(doc)
    word_nodes = {}
    word_colors = {}

    # Pass 1: create one node per distinct (lower-cased) non-punctuation word.
    for token in doc:
        if token.pos_ == 'PUNCT':
            continue
        lower_text = token.text.lower()
        if lower_text in word_nodes:
            continue
        node_id = len(word_nodes)
        word_nodes[lower_text] = node_id
        color = POS_COLORS.get(token.pos_, '#FFFFFF')
        word_colors[lower_text] = color
        G.add_node(node_id,
                   label=f"{token.text}\n[{POS_TRANSLATIONS.get(token.pos_, token.pos_)}]",
                   pos=token.pos_,
                   # Node size scales with how frequent the word's POS tag is.
                   size=pos_counts[token.pos_] * 500,
                   color=color)

    # Pass 2: every node now exists, so heads that appear later in the text
    # than their dependents can be resolved.
    for token in doc:
        if token.pos_ == 'PUNCT' or token.dep_ == "ROOT" or token.head.pos_ == 'PUNCT':
            continue
        head_id = word_nodes.get(token.head.text.lower())
        # The head can still be missing if it lies outside the iterated tokens.
        if head_id is not None:
            G.add_edge(head_id, word_nodes[token.text.lower()], label=token.dep_)

    return G, word_colors

def visualize_syntax_graph(doc):
    """Draw the syntax graph of ``doc`` on a new matplotlib figure.

    The graph comes from ``create_syntax_graph``. Nodes are positioned with
    a spring layout and drawn with their stored color and size; node and
    edge labels are drawn on top. A legend lists every part-of-speech tag
    present in the graph together with its token count.

    Args:
        doc: The parsed document to visualize.

    Returns:
        The ``matplotlib.pyplot`` module, with the new figure as the
        current figure.
    """
    # Only the graph is needed here; the word-color mapping is discarded.
    G, _ = create_syntax_graph(doc)

    plt.figure(figsize=(20, 15))
    layout = nx.spring_layout(G, k=2, iterations=100)

    node_data = list(G.nodes(data=True))
    node_colors = [data['color'] for _, data in node_data]
    node_sizes = [data['size'] for _, data in node_data]
    node_labels = {node: data['label'] for node, data in node_data}

    nx.draw(G, layout, with_labels=False, node_color=node_colors, node_size=node_sizes, arrows=True)
    nx.draw_networkx_labels(G, layout, node_labels, font_size=8)

    edge_labels = nx.get_edge_attributes(G, 'label')
    nx.draw_networkx_edge_labels(G, layout, edge_labels=edge_labels, font_size=8)

    plt.title("Análisis Sintáctico")
    plt.axis('off')

    # Compute the tag totals and the set of tags present once, rather than
    # once per legend entry.
    pos_totals = count_pos(doc)
    present_tags = set(nx.get_node_attributes(G, 'pos').values())
    legend_elements = [
        plt.Rectangle((0, 0), 1, 1, facecolor=color, edgecolor='none',
                      label=f"{POS_TRANSLATIONS[tag]} ({pos_totals[tag]})")
        for tag, color in POS_COLORS.items()
        if tag in present_tags
    ]
    plt.legend(handles=legend_elements, loc='center left', bbox_to_anchor=(1, 0.5))

    return plt

def visualize_syntax(text, max_tokens=5000):
    """Parse ``text`` with the module-level pipeline and plot its syntax graph.

    Args:
        text: Raw text to analyze.
        max_tokens: Maximum number of tokens to visualize. Longer documents
            are cut to their first ``max_tokens`` tokens and a warning is
            printed.

    Returns:
        Whatever ``visualize_syntax_graph`` returns for the (possibly
        truncated) document.
    """
    doc = nlp(text)
    if len(doc) > max_tokens:
        # Truncate on token boundaries. Slicing the raw string would cut by
        # characters, which does not match the token limit or the warning,
        # and would require running the pipeline a second time.
        doc = doc[:max_tokens].as_doc()
        print(f"Warning: The input text is too long. Only the first {max_tokens} tokens will be visualized.")
    return visualize_syntax_graph(doc)

# Repeated words colors
def get_repeated_words_colors(doc):
    """Map every word that occurs more than once in ``doc`` to a color.

    Occurrences are counted on lower-cased token text, ignoring tokens
    tagged ``'PUNCT'``. The color is looked up in ``POS_COLORS`` by the
    token's tag (``'#FFFFFF'`` when the tag is unknown); when a word occurs
    with several tags, the last occurrence in ``doc`` decides.

    Args:
        doc: Iterable of tokens exposing ``text`` and ``pos_``.

    Returns:
        A dict from lower-cased repeated word to its hex color string.
    """
    frequencies = Counter()
    for token in doc:
        if token.pos_ != 'PUNCT':
            frequencies[token.text.lower()] += 1
    duplicated = {word for word, total in frequencies.items() if total > 1}

    palette = {}
    for token in doc:
        key = token.text.lower()
        if key not in duplicated:
            continue
        palette[key] = POS_COLORS.get(token.pos_, '#FFFFFF')
    return palette

def highlight_repeated_words(doc, word_colors):
    """Render ``doc`` as HTML with the words in ``word_colors`` highlighted.

    Each token whose lower-cased text is a key of ``word_colors`` is wrapped
    in a ``<span>`` with that background color; all other tokens are emitted
    as plain text. Token text is HTML-escaped first, because it is placed
    into markup and characters such as ``<`` or ``&`` would otherwise be
    interpreted as HTML.

    Args:
        doc: Iterable of tokens exposing ``text``.
        word_colors: Mapping from lower-cased word to a CSS color string.

    Returns:
        The tokens' HTML fragments joined by single spaces.
    """
    pieces = []
    for token in doc:
        safe_text = html.escape(token.text)
        key = token.text.lower()
        if key in word_colors:
            pieces.append(f'<span style="background-color: {word_colors[key]};">{safe_text}</span>')
        else:
            pieces.append(safe_text)
    return ' '.join(pieces)