new_scrape_algorithm_#5 (#20)

First implementation of the new scrape algorithm that was proposed in issue #5.

Reviewed-on: anthrove/e621-to-graph#20
Reviewed-by: Lennard Brinkhaus <lennard.brinkhaus@noreply.localhost>
Reviewed-by: daskadse <daskadse@noreply.localhost>
Co-authored-by: SoXX <soxx@fenpa.ws>
Co-committed-by: SoXX <soxx@fenpa.ws>
This commit is contained in:
SoXX 2023-11-15 20:28:44 +00:00 committed by Lennard Brinkhaus
parent 60b3502ee3
commit 3be16a9277
8 changed files with 163 additions and 103 deletions

View File

@ -20,6 +20,10 @@ func NewNeo4JConnection(neo4jDebug bool) logic.GraphConnection {
} }
} }
// GetUserFavoriteCount reports how many favorites the graph holds for the
// given user. It delegates to GetUserFavoritesCount using this connection's
// driver and passes the result through unchanged.
func (c *neo4jConnection) GetUserFavoriteCount(ctx context.Context, userID model.UserID) (int64, error) {
	favoriteCount, err := GetUserFavoritesCount(ctx, c.driver, userID)
	return favoriteCount, err
}
func (c *neo4jConnection) CheckUserToPostLink(ctx context.Context, e621PostID model.PostID, e621UserID model.UserID) (bool, error) { func (c *neo4jConnection) CheckUserToPostLink(ctx context.Context, e621PostID model.PostID, e621UserID model.UserID) (bool, error) {
return CheckUserToPostLink(ctx, c.driver, e621PostID, e621UserID) return CheckUserToPostLink(ctx, c.driver, e621PostID, e621UserID)
} }

View File

@ -8,9 +8,8 @@ import (
func CreatePostNode(ctx context.Context, driver neo4j.DriverWithContext, postID model.PostID) error { func CreatePostNode(ctx context.Context, driver neo4j.DriverWithContext, postID model.PostID) error {
query := ` query := `
MERGE (u:e621Post {e621PostID: $postID}) MERGE (u:e621Post {e621PostID: $postID});
RETURN u `
`
params := map[string]any{ params := map[string]any{
"postID": postID, "postID": postID,
} }

View File

@ -11,7 +11,7 @@ func EstablishPostTagLink(ctx context.Context, driver neo4j.DriverWithContext, e
query := ` query := `
MATCH (p:e621Post {e621PostID: $e621PostID}) MATCH (p:e621Post {e621PostID: $e621PostID})
MATCH (t:e621Tag {e621Tag: $e621Tag}) MATCH (t:e621Tag {e621Tag: $e621Tag})
MERGE (p)-[:HAS_TAG]->(t) MERGE (p)-[:HAS_TAG]->(t);
` `
params := map[string]interface{}{ params := map[string]interface{}{
"e621PostID": e621PostID, "e621PostID": e621PostID,

View File

@ -7,9 +7,8 @@ import (
func CreateSourceNode(ctx context.Context, driver neo4j.DriverWithContext, URL string) error { func CreateSourceNode(ctx context.Context, driver neo4j.DriverWithContext, URL string) error {
query := ` query := `
MERGE (u:Source {URL: $url}) MERGE (u:Source {URL: $url});
RETURN u `
`
params := map[string]any{ params := map[string]any{
"url": URL, "url": URL,
} }

View File

@ -8,9 +8,8 @@ import (
func CreateTagNode(ctx context.Context, driver neo4j.DriverWithContext, name string, tagType string) error { func CreateTagNode(ctx context.Context, driver neo4j.DriverWithContext, name string, tagType string) error {
query := ` query := `
MERGE (u:e621Tag {e621Tag: $name, e621TagType: $tagType}) MERGE (u:e621Tag {e621Tag: $name, e621TagType: $tagType});
RETURN u `
`
params := map[string]interface{}{ params := map[string]interface{}{
"name": name, "name": name,
"tagType": tagType, "tagType": tagType,
@ -30,7 +29,7 @@ func GetTagNodeByName(ctx context.Context, driver neo4j.DriverWithContext, name
query := ` query := `
MATCH (u:e621Tag {e621Tag: $name}) MATCH (u:e621Tag {e621Tag: $name})
RETURN u.e621Tag as e621Tag, u.e621TagType as e621TagType RETURN u.e621Tag AS e621Tag, u.e621TagType AS e621TagType;
` `
params := map[string]interface{}{ params := map[string]interface{}{

View File

@ -8,8 +8,7 @@ import (
func CreateUserNode(ctx context.Context, driver neo4j.DriverWithContext, user model.User) error { func CreateUserNode(ctx context.Context, driver neo4j.DriverWithContext, user model.User) error {
query := ` query := `
MERGE (u:e621User {e621ID: $id, e621Username: $name}) MERGE (u:e621User {e621ID: $id, e621Username: $name});
RETURN u
` `
params := map[string]interface{}{ params := map[string]interface{}{
"id": user.ID, "id": user.ID,
@ -22,3 +21,34 @@ func CreateUserNode(ctx context.Context, driver neo4j.DriverWithContext, user mo
} }
return nil return nil
} }
// GetUserFavoritesCount counts the IS_FAVORITE relationships leading from the
// e621User node with the given userID to e621Post nodes.
//
// It returns 0 and a nil error when the query yields no records, and 0
// together with the error when the query itself or reading the
// numberOfFavoritedPosts column fails.
func GetUserFavoritesCount(ctx context.Context, driver neo4j.DriverWithContext, userID model.UserID) (int64, error) {
	query := `
MATCH (:e621User {e621ID: $userID})-[:IS_FAVORITE]->(:e621Post)
RETURN count(*) AS numberOfFavoritedPosts;
`
	params := map[string]any{
		"userID": userID,
	}

	result, err := neo4j.ExecuteQuery(ctx, driver, query, params, neo4j.EagerResultTransformer)
	if err != nil {
		return 0, err
	}

	// Defensive guard: no record came back, so report zero favorites.
	// NOTE(review): an aggregation such as count(*) without grouping keys
	// normally yields one row even when nothing matches — confirm whether
	// this branch is reachable.
	if len(result.Records) == 0 {
		return 0, nil
	}

	// The isNil flag is deliberately ignored: the column is produced by
	// count(*), and a missing value is treated as a count of zero.
	count, _, err := neo4j.GetRecordValue[int64](result.Records[0], "numberOfFavoritedPosts")
	if err != nil {
		return 0, err
	}
	return count, nil
}

View File

@ -13,9 +13,10 @@ import (
func ScrapeUser(ctx context.Context, graphConnection logic.GraphConnection, client *e621.Client, username string) error { func ScrapeUser(ctx context.Context, graphConnection logic.GraphConnection, client *e621.Client, username string) error {
var err error var err error
scrapeTime := time.Now()
e621User, err := client.GetUserByName(username).Execute() e621User, err := client.GetUserByName(username).Execute()
if err != nil { if err != nil {
log.Info(err)
return err return err
} }
@ -28,107 +29,134 @@ func ScrapeUser(ctx context.Context, graphConnection logic.GraphConnection, clie
return nil return nil
} }
log.WithFields(log.Fields{
"e621_username": e621User.Name,
"e621_user_id": e621User.ID,
}).Info("service: processing user")
err = graphConnection.UploadUser(ctx, e621User) err = graphConnection.UploadUser(ctx, e621User)
if err != nil { if err != nil {
log.Fatal(err) return err
} }
currentDBFavCount, err := graphConnection.GetUserFavoriteCount(ctx, e621User.ID)
if err != nil {
return err
}
favoriteBuilder, err := client.GetFavoritesForUser(e621User.Name)
if err != nil {
return err
}
if currentDBFavCount > e621User.FavoriteCount {
//TODO: IMPLEMENT USER MARKED FOR DELETED FAVS
log.WithFields(log.Fields{ log.WithFields(log.Fields{
"e621_username": e621User.Name, "e621_username": e621User.Name,
"e621_user_id": e621User.ID, "e621_user_id": e621User.ID,
}).Info("service: start processing favorites") "e621_current_db_favorite_count": currentDBFavCount,
start := time.Now() "e621_user_favorite_count": e621User.FavoriteCount,
}).Debug("service: user has favorites deleted")
}
e621FavoritesBuilder := client.GetFavoritesBuilder().SetUserID(e621User.ID) var pageIndex = 1
e621Favorites, err := client.GetAllFavoritesForUser(e621FavoritesBuilder) for currentDBFavCount < e621User.FavoriteCount {
// Uploads all Tags, Posts as Nodes to Neo4j favorites, err := favoriteBuilder.Page(pageIndex).Execute()
for i, post := range e621Favorites { if err != nil {
if exists, err := graphConnection.CheckUserToPostLink(ctx, post.ID, e621User.ID); err == nil && exists { return err
log.WithFields(log.Fields{ }
"e621_username": e621User.Name,
"e621_user_id": e621User.ID, if len(favorites) <= 0 {
"last_post_id": post.ID, return nil
}).Info("service: no new favorites found") }
for _, favorite := range favorites {
if currentDBFavCount == e621User.FavoriteCount {
break break
} else if err != nil {
return err
} }
start = time.Now() isFaved, err := graphConnection.CheckUserToPostLink(ctx, favorite.ID, e621User.ID)
err = uploadNodes(ctx, graphConnection, post)
if err != nil { if err != nil {
return err return err
} }
if !isFaved {
err = uploadDataToDB(ctx, graphConnection, favorite, e621User)
if err != nil {
return err
}
currentDBFavCount++
}
}
pageIndex++
}
log.WithFields(log.Fields{ log.WithFields(log.Fields{
"e621_username": e621User.Name, "e621_username": e621User.Name,
"e621_user_id": e621User.ID, "e621_user_id": e621User.ID,
"post_number": i, "post_amount": e621User.FavoriteCount,
"post_amount": len(e621Favorites), "scrape_time": time.Since(scrapeTime),
"post_id": post.ID,
"upload_time": time.Since(start),
}).Debug("service: uploading post")
start := time.Now()
err = uploadPostToUserRelationship(ctx, graphConnection, post, e621User)
if err != nil {
log.Fatal(err)
return err
}
err = uploadSourceTagRelationship(ctx, graphConnection, post)
if err != nil {
log.Fatal(err)
return err
}
err = uploadGeneralTagRelationship(ctx, graphConnection, post)
if err != nil {
log.Fatal(err)
return err
}
err = uploadCharacterTagtRelationship(ctx, graphConnection, post)
if err != nil {
log.Fatal(err)
return err
}
err = uploadCopyrightTagRelationship(ctx, graphConnection, post)
if err != nil {
log.Fatal(err)
return err
}
err = uploadArtistTagRelationship(ctx, graphConnection, post)
if err != nil {
log.Fatal(err)
return err
}
log.WithFields(log.Fields{
"e621_username": e621User.Name,
"e621_user_id": e621User.ID,
"post_number": i,
"post_amount": len(e621Favorites),
"post_id": post.ID,
"upload_time": time.Since(start),
}).Debug("service: making relationship")
}
log.WithFields(log.Fields{
"e621_username": e621User.Name,
"e621_user_id": e621User.ID,
"post_amount": len(e621Favorites),
"scrape_time": time.Since(start),
}).Info("service: finished processing favorites") }).Info("service: finished processing favorites")
return nil return nil
} }
// uploadDataToDB stores a single favorited post in the graph. It first calls
// uploadNodes, then creates the relationships in this order:
// uploadPostToUserRelationship, uploadSourceTagRelationship,
// uploadGeneralTagRelationship, uploadCharacterTagtRelationship,
// uploadCopyrightTagRelationship and uploadArtistTagRelationship.
//
// The first failing step aborts the upload and its error is returned to the
// caller. Two debug log entries record how long the node upload and the
// relationship upload took.
func uploadDataToDB(ctx context.Context, graphConnection logic.GraphConnection, favorite model.Post, e621User model.User) error {
	start := time.Now()

	if err := uploadNodes(ctx, graphConnection, favorite); err != nil {
		return err
	}

	log.WithFields(log.Fields{
		"e621_username": e621User.Name,
		"e621_user_id":  e621User.ID,
		"post_id":       favorite.ID,
		"upload_time":   time.Since(start),
	}).Debug("service: uploaded post")

	// Restart the timer so the second log entry covers only the relationships.
	start = time.Now()

	// Errors are returned instead of being passed to log.Fatal: Fatal ends the
	// process, which made the following return unreachable and took the
	// decision away from the caller. This matches how the uploadNodes error
	// above is handled.
	if err := uploadPostToUserRelationship(ctx, graphConnection, favorite, e621User); err != nil {
		return err
	}

	if err := uploadSourceTagRelationship(ctx, graphConnection, favorite); err != nil {
		return err
	}

	if err := uploadGeneralTagRelationship(ctx, graphConnection, favorite); err != nil {
		return err
	}

	if err := uploadCharacterTagtRelationship(ctx, graphConnection, favorite); err != nil {
		return err
	}

	if err := uploadCopyrightTagRelationship(ctx, graphConnection, favorite); err != nil {
		return err
	}

	if err := uploadArtistTagRelationship(ctx, graphConnection, favorite); err != nil {
		return err
	}

	log.WithFields(log.Fields{
		"e621_username": e621User.Name,
		"e621_user_id":  e621User.ID,
		"post_id":       favorite.ID,
		"upload_time":   time.Since(start),
	}).Debug("service: made relationship")

	return nil
}
// uploadNodes uploads the post to the database and creates the nodes // uploadNodes uploads the post to the database and creates the nodes
func uploadNodes(ctx context.Context, graphConnection logic.GraphConnection, post model.Post) error { func uploadNodes(ctx context.Context, graphConnection logic.GraphConnection, post model.Post) error {

View File

@ -15,4 +15,5 @@ type GraphConnection interface {
EstablishPostToSourceLink(ctx context.Context, e621PostID model.PostID, sourceURL string) error EstablishPostToSourceLink(ctx context.Context, e621PostID model.PostID, sourceURL string) error
EstablishUserToPostLink(ctx context.Context, e621PostID model.PostID, e621UserID model.UserID) error EstablishUserToPostLink(ctx context.Context, e621PostID model.PostID, e621UserID model.UserID) error
CheckUserToPostLink(ctx context.Context, e621PostID model.PostID, e621UserID model.UserID) (bool, error) CheckUserToPostLink(ctx context.Context, e621PostID model.PostID, e621UserID model.UserID) (bool, error)
GetUserFavoriteCount(ctx context.Context, userID model.UserID) (int64, error)
} }