In [1]:
# Dependencies (uncomment to install them into the kernel's environment):
# %pip install simplet5
# %pip install gradio_client



In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json

In [24]:
def get_metadata(data_file):
    """Lazily yield the raw lines of a line-oriented metadata file.

    The file is streamed one line at a time, so it is never loaded into memory
    as a whole. Each yielded string keeps its trailing newline and is not
    parsed here; decoding the content is left to the caller.

    Args:
        data_file: Path of the text file to read.

    Yields:
        str: The next line of the file, exactly as read.

    Raises:
        OSError: If the file cannot be opened. Because this is a generator,
            the error surfaces on the first iteration, not at call time.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Decode explicitly as UTF-8: without `encoding=` Python falls back to the
    # platform's locale encoding, which can fail on non-ASCII text.
    with open(data_file, 'r', encoding='utf-8') as file:
        yield from file

In [25]:
# Path of the metadata snapshot that is scanned below.
file_path = "arxiv-metadata-oai-snapshot.json"

# Accumulators: `titles` and `abstracts` are filled by the scanning loop;
# `years` is initialised here but not written by any cell visible in this notebook.
titles, abstracts, years = [], [], []

# Lazy line iterator over the snapshot; the file is only read once the
# generator is consumed.
metadata = get_metadata(file_path)

In [26]:
# Keep only the papers whose journal reference ends in a year strictly between
# 2013 and 2023, collecting their titles and abstracts in parallel lists.
for paper in metadata:
    paper_dict = json.loads(paper)
    ref = paper_dict.get('journal-ref')
    try:
        # Heuristic: the last four characters of the reference are read as the year.
        year = int(ref[-4:])
    except (TypeError, ValueError):
        # TypeError: the record has no journal reference (None).
        # ValueError: the tail of the reference is not a number.
        # Both mean "no usable year", so the paper is skipped. Any other
        # exception (including KeyboardInterrupt) now propagates instead of
        # being silently swallowed by a bare `except`.
        continue
    if 2013 < year < 2023:
        titles.append(paper_dict.get('title'))
        abstracts.append(paper_dict.get('abstract'))

In [27]:
# Assemble the collected lists into one table: titles become `target_text`
# and abstracts become `source_text`.
df = pd.DataFrame(dict(target_text=titles, source_text=abstracts))

# Preview the first rows.
df.head()

Unnamed: 0,target_text,source_text
0,Entanglement in a Jaynes-Cummings Model with T...,We investigate the conditions of entanglemen...
1,Banach-like metrics and metrics of compact sets,We present and study a family of metrics on ...
2,On the Cohomological Derivation of Yang-Mills ...,We present a brief review of the cohomologic...
3,Geometric Computational Electrodynamics with V...,"In this paper, we develop a structure-preser..."
4,A presentation for the mapping class group of ...,Finite presentations for the mapping class g...


In [28]:
# Put the input column first, followed by the target column.
column_order = ['source_text', 'target_text']
df = df[column_order]

In [29]:
# Tag every input with the "summarize: " task prefix. Rows that already carry
# the prefix are left alone, which keeps this cell idempotent: re-running it no
# longer produces "summarize: summarize: ...".
task_prefix = "summarize: "
already_prefixed = df['source_text'].str.startswith(task_prefix, na=False)
df['source_text'] = df['source_text'].where(already_prefixed, task_prefix + df['source_text'])

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split reproducible across kernel restarts.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [30]:
from simplet5 import SimpleT5

# Start from the pretrained "t5-base" checkpoint.
model = SimpleT5()
model.from_pretrained(model_type="t5", model_name="t5-base")

# Fine-tuning settings, gathered in one place so they are easy to adjust.
training_options = dict(
    source_max_token_len=128,
    target_max_token_len=50,
    batch_size=8,
    max_epochs=3,
    use_gpu=True,
)

# Fine-tune on the training split and evaluate on the held-out split.
model.train(train_df=train_df, eval_df=test_df, **training_options)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
Global seed set to 42
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

OSError: [Errno 28] No space left on device

Komentarz po wykonaniu projektu: Pomimo powyższego błędu model został wyeksportowany poprawnie (w kolejnej komórce wczytywany jest punkt kontrolny z epoki 0). Błąd wynikał z braku miejsca na dysku twardym podczas zapisu.

In [31]:
# Load the fine-tuned weights from a saved checkpoint directory
# (presumably the one written by the training run above — confirm the path exists).
checkpoint_dir = "outputs/simplet5-epoch-0-train-loss-2.2777-val-loss-2.0139"
model.load_model("t5", checkpoint_dir, use_gpu=True)

In [34]:
# Smoke test: run the loaded model on one hand-picked abstract. The input
# carries the same "summarize: " prefix that was added to the training inputs.
text_to_summarize="""summarize: A fully differential calculation in perturbative quantum chromodynamics is presented for the production of massive photon pairs at hadron colliders. All next-to-leading order perturbative contributions from quark-antiquark, gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as all-orders resummation of initial-state gluon radiation valid at next-to-next-to-leading logarithmic accuracy. The region of phase space is specified in which the calculation is most reliable. Good agreement is demonstrated with data from the Fermilab Tevatron, and predictions are made for more detailed tests with CDF and DO data. Predictions are shown for distributions of diphoton pairs produced at the energy of the Large Hadron Collider (LHC). Distributions of the diphoton pairs from the decay of a Higgs boson are contrasted with those produced from QCD processes at the LHC, showing that enhanced sensitivity to the signal can be obtained with judicious selection of events
"""
# The last expression is displayed as the cell output (a list of generated titles).
model.predict(text_to_summarize)

['Quantum chromodynamics of massive photon pairs at hadron colliders']

In [46]:
import random

# Pick a random row by position. The bound comes from the actual table size
# instead of a hard-coded 3000, so the lookup cannot fall outside the frame,
# and `.iloc` keeps the access positional regardless of the index labels.
rand_int = random.randrange(len(df))

source_abstract = df['source_text'].iloc[rand_int]
source_title = df['target_text'].iloc[rand_int]
# model.predict returns a list of generated strings; the first one is shown below.
predicted_title = model.predict(source_abstract)


print("Abstract: " + source_abstract)
print("\nSource Title: " + source_title)
print("\nPredicted title: "+ predicted_title[0])


Abstract: summarize:   We study the interaction of an elastic beam with a liquid drop in the case
where bending and extensional effects are both present. We use a variational
approach to derive equilibrium equations and constitutive relation for the
beam. This relation is shown to include a term due to surface energy in
addition of the classical Young's modulus term, leading to a modification of
Hooke's law. At the triple point where solid, liquid, and vapor phases meet we
find that the external force applied on the beam is parallel to the
liquid-vapor interface. Moreover, in the case where solid-vapor and
solid-liquid interface energies do not depend on the extension state of the
beam, we show that the extension in the beam is continuous at the triple point
and that the wetting angle satisfy the classical Young-Dupr\'e relation.


Source Title: Soft beams: when capillarity induces axial compression

Predicted title: A variational approach to the interaction of an elastic beam with a l

In [None]:
# The model is hosted on Hugging Face and can be queried through its API, as shown below.
# The first 6 input cells of this notebook must have been run to obtain the `df` variable.

import random
from gradio_client import Client

# Pick a random row by position; the bound follows the real table size rather
# than a hard-coded 3000, so the lookup cannot fall outside the frame.
rand_int = random.randrange(len(df))

source_abstract = df['source_text'].iloc[rand_int]
source_title = df['target_text'].iloc[rand_int]

client = Client("https://skydem-pjn-predict-titles.hf.space/")
# `api_name` must be passed as a keyword argument; the stray quote that used to
# precede it made this cell a SyntaxError.
result = client.predict(source_abstract, api_name="/predict")

# NOTE(review): the shape of `result` depends on how the Space declares its
# outputs — TODO confirm. A plain string is used as-is (indexing it with [0]
# would print only its first character); a sequence contributes its first item.
predicted_title = result if isinstance(result, str) else result[0]


print("Abstract: " + source_abstract)
print("\nSource Title: " + source_title)
print("\nPredicted title: " + predicted_title)
